diff --git a/.gitignore b/.gitignore
index daf719e6f..c89b827fb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,6 +21,8 @@ examples/*.pdf
 
 *.vtu
 *.vts
+*.png
+*.gif
 
 .cache
 
diff --git a/doc/conf.py b/doc/conf.py
index 9e5395862..a9dccf5b3 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -35,6 +35,7 @@
 }
 
 nitpick_ignore_regex = [
+    ["py:class", r".*_ProxyNeighborEvaluationResult"],
     # Sphinx started complaining about these in 8.2.1(-ish)
     # -AK, 2025-02-24
     ["py:class", r"TypeAliasForwardRef"],
@@ -70,13 +71,13 @@
     "cl_array.Array": "obj:pyopencl.array.Array",
     # pymbolic
     "ArithmeticExpression": "obj:pymbolic.ArithmeticExpression",
+    "ArithmeticExpressionContainerTc":
+        "obj:pymbolic.typing.ArithmeticExpressionContainerTc",
     "Expression": "obj:pymbolic.typing.Expression",
     "MultiVector": "obj:pymbolic.geometric_algebra.MultiVector",
     "Variable": "class:pymbolic.primitives.Variable",
     "prim.Subscript": "class:pymbolic.primitives.Subscript",
     "prim.Variable": "class:pymbolic.primitives.Variable",
-    "ArithmeticExpressionContainerTc":
-        "obj:pymbolic.typing.ArithmeticExpressionContainerTc",
     # arraycontext
     "ArrayContainer": "obj:arraycontext.ArrayContainer",
     "ArrayOrContainerOrScalar": "obj:arraycontext.ArrayOrContainerOrScalar",
@@ -91,6 +92,7 @@
     # boxtree
     "FromSepSmallerCrit": "obj:boxtree.traversal.FromSepSmallerCrit",
     "TimingResult": "class:boxtree.timing.TimingResult",
+    "Tree": "obj:boxtree.tree.Tree",
     "TreeKind": "obj:boxtree.tree_build.TreeKind",
     # sumpy
     "ExpansionBase": "class:sumpy.expansion.ExpansionBase",
@@ -114,11 +116,10 @@
     "Side": "obj:pytential.symbolic.primitives.Side",
     "TargetOrDiscretization": "obj:pytential.target.TargetOrDiscretization",
     "VectorExpression": "obj:pytential.symbolic.pde.scalar.VectorExpression",
-    "pytential.symbolic.dof_desc.DOFDescriptorLike":
-        "data:pytential.symbolic.dof_desc.DOFDescriptorLike",
-    "pytential.symbolic.primitives.ExpressionNode":
-        "class:pytential.symbolic.primitives.ExpressionNode",
+    "pytential.symbolic.dof_desc.DOFDescriptorLike": "data:pytential.symbolic.dof_desc.DOFDescriptorLike",  # noqa: E501
+    "pytential.symbolic.primitives.ExpressionNode": "class:pytential.symbolic.primitives.ExpressionNode",  # noqa: E501
     "sym.DOFDescriptor": "class:pytential.symbolic.dof_desc.DOFDescriptor",
+    "sym.DOFDescriptorLike": "obj:pytential.symbolic.dof_desc.DOFDescriptorLike",
     "sym.IntG": "class:pytential.symbolic.primitives.IntG",
     "sym.var": "obj:pytential.symbolic.primitives.var",
 }
diff --git a/doc/linalg.rst b/doc/linalg.rst
index de26425a0..97f12acbf 100644
--- a/doc/linalg.rst
+++ b/doc/linalg.rst
@@ -32,6 +32,7 @@ Low-level Functionality
     All the classes and routines in this module are experimental and the
     API can change at any point.
 
+.. automodule:: pytential.linalg.cluster
 .. automodule:: pytential.linalg.proxy
 .. automodule:: pytential.linalg.skeletonization
 
diff --git a/examples/scaling-study-hmatrix.py b/examples/scaling-study-hmatrix.py
new file mode 100644
index 000000000..a66f82111
--- /dev/null
+++ b/examples/scaling-study-hmatrix.py
@@ -0,0 +1,199 @@
+__copyright__ = "Copyright (C) 2022 Alexandru Fikl"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import logging
+from dataclasses import dataclass
+
+import numpy as np
+
+from meshmode.array_context import PyOpenCLArrayContext
+from pytools.convergence import EOCRecorder
+
+from pytential import GeometryCollection, sym
+
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True)
+class Timings:  # wall-clock timings returned by run_hmatrix_matvec
+    build: float  # ProcessTimer.wall_elapsed of the second (recorded) build
+    matvec: float  # ProcessTimer.wall_elapsed of a single matvec
+
+
+def run_hmatrix_matvec(
+        actx: PyOpenCLArrayContext, places: GeometryCollection, *,
+        dofdesc: sym.DOFDescriptor) -> Timings:
+    """Return wall-clock timings of an H-matrix build and of one matvec."""
+    from sumpy.kernel import LaplaceKernel
+    kernel = LaplaceKernel(places.ambient_dim)
+    sym_u = sym.var("u")
+    sym_op = -0.5 * sym_u + sym.D(kernel, sym_u, qbx_forced_limit="avg")
+
+    density_discr = places.get_discretization(dofdesc.geometry, dofdesc.discr_stage)
+    u = actx.thaw(density_discr.nodes()[0])
+
+    def build_hmat():
+        from pytential.linalg.hmatrix import build_hmatrix_by_proxy
+        return build_hmatrix_by_proxy(
+            actx, places, sym_op, sym_u,
+            domains=[dofdesc],
+            context={},
+            auto_where=dofdesc,
+            id_eps=1.0e-10,
+            _tree_kind="adaptive-level-restricted",
+            _approx_nproxy=64,
+            _proxy_radius_factor=1.15).get_forward()
+
+    # warmup: this first build is logged, but its timing is not returned
+    from pytools import ProcessTimer
+    with ProcessTimer() as pt:
+        hmat = build_hmat()
+        actx.queue.finish()
+
+    logger.info("build(warmup): %s", pt)
+
+    # build: the second build is the one that gets recorded
+    with ProcessTimer() as pt:
+        hmat = build_hmat()
+        actx.queue.finish()
+
+    t_build = pt.wall_elapsed
+    logger.info("build: %s", pt)
+
+    # matvec
+    with ProcessTimer() as pt:
+        du = hmat @ u
+        assert du is not None
+        actx.queue.finish()
+
+    t_matvec = pt.wall_elapsed
+    logger.info("matvec: %s", pt)
+
+    return Timings(t_build, t_matvec)
+
+
+def run_scaling_study(  # time the H-matrix build and matvec on refined meshes
+        ambient_dim: int, *,  # NOTE(review): only used to name the geometry
+        target_order: int = 4,  # order of the mesh and of the discretization
+        source_ovsmp: int = 4,  # fine_order is source_ovsmp * target_order
+        qbx_order: int = 4,
+        ) -> None:
+    dd = sym.DOFDescriptor(f"d{ambient_dim}", discr_stage=sym.QBX_SOURCE_STAGE2)
+
+    import pyopencl as cl
+    ctx = cl.create_some_context()
+    queue = cl.CommandQueue(ctx)
+    actx = PyOpenCLArrayContext(queue)
+
+    eoc_build = EOCRecorder()
+    eoc_matvec = EOCRecorder()
+
+    import meshmode.discretization.poly_element as mpoly
+    import meshmode.mesh.generation as mgen
+
+    resolutions = [64, 128, 256, 512, 1024, 1536, 2048, 2560, 3072]
+
+    for n in resolutions:  # n is the number of points handed to np.linspace
+        mesh = mgen.make_curve_mesh(
+            mgen.NArmedStarfish(5, 0.25),
+            np.linspace(0, 1, n),
+            order=target_order)
+
+        from meshmode.discretization import Discretization
+        pre_density_discr = Discretization(actx, mesh,
+            mpoly.InterpolatoryQuadratureGroupFactory(target_order))
+
+        from pytential.qbx import QBXLayerPotentialSource
+        qbx = QBXLayerPotentialSource(
+            pre_density_discr,
+            fine_order=source_ovsmp * target_order,
+            qbx_order=qbx_order,
+            fmm_order=False, fmm_backend=None,
+            )
+        places = GeometryCollection(qbx, auto_where=dd.geometry)
+        density_discr = places.get_discretization(dd.geometry, dd.discr_stage)
+
+        logger.info("ndofs:     %d", density_discr.ndofs)
+        logger.info("nelements: %d", density_discr.mesh.nelements)
+
+        timings = run_hmatrix_matvec(actx, places, dofdesc=dd)
+        eoc_build.add_data_point(density_discr.ndofs, timings.build)
+        eoc_matvec.add_data_point(density_discr.ndofs, timings.matvec)
+
+    for name, eoc in [("build", eoc_build), ("matvec", eoc_matvec)]:
+        logger.info("%s\n%s",
+            name, eoc.pretty_print(
+                abscissa_label="dofs",
+                error_label=f"{name} (s)",
+                abscissa_format="%d",
+                error_format="%.3fs",
+                eoc_format="%.2f",
+                )
+            )
+        visualize_eoc(f"scaling-study-hmatrix-{name}", eoc, 1)  # order 1: linear
+
+
+def visualize_eoc(
+        filename: str, eoc: EOCRecorder, order: int,
+        overwrite: bool = False) -> None:
+    """Plot the history of *eoc* in log-log scale next to a slope of *order*."""
+    try:
+        import matplotlib.pyplot as plt
+    except ImportError:
+        logger.info("matplotlib not available for plotting")
+        return
+
+    # check the output path before any figure is created, so nothing leaks
+    import pathlib
+    outfile = pathlib.Path(filename)
+    if not overwrite and outfile.exists():
+        raise FileExistsError(f"output file '{outfile}' already exists")
+
+    fig = plt.figure(figsize=(10, 10), dpi=300)
+    ax = fig.gca()
+
+    h, error = np.array(eoc.history).T  # type: ignore[no-untyped-call]
+    ax.loglog(h, error, "o-")
+
+    # reference line from (max_h, max_e) to (min_h, min_e) with slope *order*
+    max_h = np.max(h)
+    min_e = np.min(error)
+    max_e = np.max(error)
+    min_h = np.exp(np.log(max_h) + np.log(min_e / max_e) / order)
+    ax.loglog(
+        [max_h, min_h], [max_e, min_e], "k-", label=rf"$\mathcal{{O}}(h^{order})$"
+    )
+
+    ax.grid(True, which="major", linestyle="-", alpha=0.75)
+    ax.grid(True, which="minor", linestyle="--", alpha=0.5)
+
+    ax.set_xlabel("$N$")
+    ax.set_ylabel("$T~(s)$")
+
+    fig.savefig(outfile)
+    plt.close(fig)
+
+
+if __name__ == "__main__":
+    run_scaling_study(ambient_dim=2)  # the study meshes a 2D starfish curve
diff --git a/pytential/linalg/cluster.py b/pytential/linalg/cluster.py
new file mode 100644
index 000000000..b8a96c46e
--- /dev/null
+++ b/pytential/linalg/cluster.py
@@ -0,0 +1,595 @@
+from __future__ import annotations
+
+
+__copyright__ = "Copyright (C) 2022 Alexandru Fikl"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import logging
+import pathlib
+from dataclasses import dataclass, replace
+from functools import singledispatch
+from typing import TYPE_CHECKING, Any
+
+import numpy as np
+
+from arraycontext import PyOpenCLArrayContext
+from meshmode.discretization import Discretization
+from pytools import log_process, memoize_method, obj_array
+
+from pytential import GeometryCollection, sym
+from pytential.linalg.utils import IndexList, TargetAndSourceClusterList
+from pytential.qbx import QBXLayerPotentialSource
+
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+
+    import optype.numpy as onp
+
+    from boxtree.tree import Tree
+    from boxtree.tree_build import TreeKind
+
+    from pytential.linalg.proxy import ProxyGenerator
+
+
+logger = logging.getLogger(__name__)
+
+__doc__ = """
+Clustering
+~~~~~~~~~~
+
+.. autoclass:: ClusterLevel
+.. autoclass:: ClusterTree
+
+.. autofunction:: split_array
+.. autofunction:: cluster
+.. autofunction:: uncluster
+
+.. autofunction:: partition_by_nodes
+"""
+
+# FIXME: this is just an arbitrary value
+_DEFAULT_MAX_PARTICLES_IN_BOX = 32
+
+
+# {{{ cluster tree
+
+
+def make_cluster_parent_map(
+        parent_ids: onp.Array1D[np.integer],
+    ) -> obj_array.ObjectArray1D[onp.Array1D[np.integer]]:
+    """Bucket child indices by parent for :attr:`ClusterLevel.parent_map`.
+
+    Entry ``i`` holds the positions in *parent_ids* that equal the ``i``-th
+    smallest distinct parent id (:func:`numpy.unique` returns sorted values).
+    """
+    child_indices = np.arange(parent_ids.size)
+    buckets = [child_indices[parent_ids == pid] for pid in np.unique(parent_ids)]
+
+    return obj_array.new_1d(buckets)
+
+
+@dataclass(frozen=True)
+class ClusterLevel:
+    """A level in a :class:`ClusterTree`.
+
+    .. autoattribute:: level
+    .. autoattribute:: box_ids
+    .. autoattribute:: parent_map
+    .. autoproperty:: nclusters
+    """
+
+    level: int
+    """Index of this level; the leaf level of a tree is ``nlevels - 1``."""
+    box_ids: onp.Array1D[np.integer]
+    """Box IDs of the clusters on the current level."""
+    parent_map: obj_array.ObjectArray1D[onp.Array1D[np.integer]]
+    """An object :class:`~numpy.ndarray` containing buckets of child indices,
+    i.e. ``parent_map[i]`` contains all the child indices that will cluster
+    into the same parent. The indices are positions in :attr:`box_ids`, i.e.
+    they are local to this level and unrelated to the tree box numbering.
+    """
+
+    @property
+    def nclusters(self) -> int:
+        """Number of clusters on the current level (same as the number of
+        boxes in :attr:`box_ids`).
+        """
+        return self.box_ids.size
+
+
+@dataclass(frozen=True)
+class ClusterTree:
+    r"""Hierarchical cluster representation.
+
+    .. autoattribute:: nlevels
+    .. autoattribute:: leaf_cluster_box_ids
+    .. autoattribute:: tree_cluster_parent_ids
+
+    .. autoproperty:: nclusters
+    .. autoproperty:: levels
+    .. automethod:: iter_levels
+    """
+
+    nlevels: int
+    """Total number of levels in the tree."""
+    leaf_cluster_box_ids: onp.Array1D[np.integer]
+    """Box IDs for each cluster on the leaf level of the tree."""
+    tree_cluster_parent_ids: onp.Array1D[np.integer]
+    """Parent box ID of each box in the tree, indexed by box ID."""
+
+    # NOTE: only here to allow easier debugging + testing; may be *None*
+    _tree: Tree | None
+
+    @property
+    def nclusters(self) -> int:
+        """Number of clusters in the leaf level of the tree."""
+        return self.leaf_cluster_box_ids.size
+
+    @property
+    @memoize_method
+    def levels(self) -> obj_array.ObjectArray1D[ClusterLevel]:
+        r"""An :class:`~numpy.ndarray` of :class:`ClusterLevel`\ s."""
+        return obj_array.new_1d(list(self.iter_levels()))
+
+    def iter_levels(self) -> Iterator[ClusterLevel]:
+        """
+        :returns: an iterator over all :class:`ClusterLevel` levels, leaves first.
+        """
+        # the first level is made up of the leaf clusters
+        box_ids = self.leaf_cluster_box_ids
+        parent_ids = self.tree_cluster_parent_ids[box_ids]
+        clevel = ClusterLevel(
+            level=self.nlevels - 1,
+            box_ids=box_ids,
+            parent_map=make_cluster_parent_map(parent_ids),
+            )
+
+        for _ in range(self.nlevels - 1, -1, -1):
+            yield clevel
+            # the next level consists of the distinct parents of this level
+            box_ids = np.unique(self.tree_cluster_parent_ids[clevel.box_ids])
+            parent_ids = self.tree_cluster_parent_ids[box_ids]
+            clevel = ClusterLevel(
+                level=clevel.level - 1,
+                box_ids=box_ids,
+                parent_map=make_cluster_parent_map(parent_ids)
+                )
+        # only reached once the iterator is exhausted
+        assert clevel.nclusters == 1
+
+# }}}
+
+
+# {{{ cluster
+
+def split_array(x: onp.Array1D[Any],
+                index: IndexList) -> obj_array.ObjectArray1D[onp.Array1D[Any]]:
+    """
+    :returns: an object :class:`~numpy.ndarray` whose :math:`i`-th entry holds
+        the elements of *x* that belong to the :math:`i`-th cluster in *index*.
+    """
+    assert x.size == index.nindices
+
+    pieces = [index.cluster_take(x, i) for i in range(index.nclusters)]
+
+    return obj_array.new_1d(pieces)
+
+
+@singledispatch
+def cluster(obj: object, clevel: ClusterLevel) -> Any:
+    """Merge the entries of *obj* into their parents, as grouped by
+    :attr:`ClusterLevel.parent_map`; dispatches on the type of *obj*."""
+    unsupported = type(obj).__name__
+    raise NotImplementedError(unsupported)
+
+
+@cluster.register(IndexList)
+def cluster_index_list(obj: IndexList, clevel: ClusterLevel) -> IndexList:
+    """Concatenate the index sets of sibling clusters into their parent."""
+    assert obj.nclusters == clevel.nclusters
+    if clevel.nclusters == 1:
+        # a single cluster has nothing left to merge with
+        return obj
+
+    from pytential.linalg.utils import make_index_list
+
+    merged = [
+        np.concatenate([obj.cluster_indices(ichild) for ichild in children])
+        for children in clevel.parent_map]
+    return make_index_list(obj_array.new_1d(merged))
+
+
+@cluster.register(TargetAndSourceClusterList)
+def cluster_target_and_source_cluster_list(
+        obj: TargetAndSourceClusterList, clevel: ClusterLevel,
+        ) -> TargetAndSourceClusterList:
+    """Cluster the target and the source index lists of *obj* separately."""
+    assert obj.nclusters == clevel.nclusters
+    if clevel.nclusters == 1:
+        return obj
+
+    parent_targets = cluster(obj.targets, clevel)
+    parent_sources = cluster(obj.sources, clevel)
+    return replace(obj, targets=parent_targets, sources=parent_sources)
+
+
+@cluster.register(np.ndarray)
+def cluster_ndarray(obj: obj_array.ObjectArray1D[onp.ArrayND[Any]],
+                    clevel: ClusterLevel) -> obj_array.ObjectArray1D[onp.ArrayND[Any]]:
+    """Merge per-cluster arrays into one array per parent: vectors are
+    concatenated and matrices are stacked into a block diagonal matrix."""
+    assert obj.shape == (clevel.nclusters,)
+    if clevel.nclusters == 1:
+        return obj
+
+    from pytools import single_valued
+    ndim = single_valued(block.ndim for block in obj)
+    if ndim not in (1, 2):
+        raise ValueError(f"unsupported ndarray dimension: '{ndim}'")
+    if ndim == 1:
+        return obj_array.new_1d([
+            np.concatenate([obj[i] for i in ppm]) for ppm in clevel.parent_map
+            ])
+
+    def off_diagonal(i: int, j: int):
+        return np.zeros((obj[i].shape[0], obj[j].shape[1]), dtype=obj[i].dtype)
+
+    return obj_array.new_1d([
+        np.block([
+            [obj[i] if i == j else off_diagonal(i, j) for j in ppm]
+            for i in ppm])
+        for ppm in clevel.parent_map
+        ])
+
+# }}}
+
+
+# {{{ uncluster
+
+def uncluster(ary: obj_array.ObjectArray1D[onp.Array1D[Any]],
+              index: IndexList,
+              clevel: ClusterLevel) -> obj_array.ObjectArray1D[onp.Array1D[Any]]:
+    """Inverse of :func:`cluster` for object arrays of one-dimensional arrays.
+
+    :arg ary: an object :class:`~numpy.ndarray` with one entry per bucket in
+        :attr:`ClusterLevel.parent_map`.
+    :arg index: the :class:`~pytential.linalg.utils.IndexList` of the child
+        level, whose cluster sizes determine where each parent entry is cut.
+    :returns: an object :class:`~numpy.ndarray` with one entry per cluster
+        in *index*, each a slice of the corresponding parent entry of *ary*.
+    """
+    assert ary.dtype.char == "O"
+    assert ary.shape == (clevel.parent_map.size,)
+
+    if index.nclusters == 1:
+        return ary
+
+    children_ary = np.empty(index.nclusters, dtype=object)
+    for iparent, children in enumerate(clevel.parent_map):
+        parent_ary = ary[iparent]
+        start = 0
+        for ichild in children:
+            end = start + index.cluster_size(ichild)
+            children_ary[ichild] = parent_ary[start:end]
+            start = end
+
+        assert parent_ary.shape == (start,)
+
+    return children_ary
+
+# }}}
+
+
+# {{{ cluster generation
+
+def _build_binary_ish_tree_from_starts(starts: onp.Array1D[np.integer]) -> ClusterTree:
+    """Build a :class:`ClusterTree` whose parents merge 2-3 consecutive boxes."""
+    leaf_ids = np.arange(starts.size - 1)
+
+    parent_ids_per_level: list[onp.Array1D[np.integer]] = []
+    level_ids, next_id = leaf_ids, leaf_ids.size
+    while level_ids.size > 1:
+        # NOTE: this is probably not the most efficient way to do it, but this
+        # code is mostly meant for debugging using a simple tree
+        groups = np.array_split(level_ids, level_ids.size // 2)
+        new_ids = next_id + np.arange(len(groups))
+        parent_ids_per_level.append(
+            np.repeat(new_ids, [len(group) for group in groups]))
+
+        level_ids = new_ids
+        next_id += new_ids.size
+
+    # NOTE: the root is its own parent
+    parent_ids_per_level.append(np.array([next_id - 1]))
+
+    return ClusterTree(
+        nlevels=len(parent_ids_per_level),
+        leaf_cluster_box_ids=leaf_ids,
+        tree_cluster_parent_ids=np.concatenate(parent_ids_per_level),
+        _tree=None)
+
+
+@log_process(logger)
+def partition_by_nodes(
+        actx: PyOpenCLArrayContext, places: GeometryCollection, *,
+        dofdesc: sym.DOFDescriptorLike | None = None,
+        tree_kind: TreeKind | None = "adaptive-level-restricted",
+        max_particles_in_box: int | None = None) -> tuple[IndexList, ClusterTree]:
+    """Partition the nodes of a discretization into clusters. The partition is
+    created at the lowest level of granularity, i.e. nodes, so elements may be
+    split across different clusters.
+
+    :arg dofdesc: a :class:`~pytential.symbolic.dof_desc.DOFDescriptor` for
+        the geometry in *places* to partition; defaults to ``places.auto_source``.
+    :arg tree_kind: if not *None*, it is passed to :class:`boxtree.TreeBuilder`;
+        if *None*, equally sized contiguous index ranges are used instead.
+    :arg max_particles_in_box: value used to control the number of points
+        in each cluster; defaults to ``_DEFAULT_MAX_PARTICLES_IN_BOX``.
+    """
+    if dofdesc is None:
+        dofdesc = places.auto_source
+    dofdesc = sym.as_dofdesc(dofdesc)
+
+    if max_particles_in_box is None:
+        max_particles_in_box = _DEFAULT_MAX_PARTICLES_IN_BOX
+
+    lpot_source = places.get_geometry(dofdesc.geometry)
+    assert isinstance(lpot_source, Discretization | QBXLayerPotentialSource)
+
+    discr = places.get_discretization(dofdesc.geometry, dofdesc.discr_stage)
+    assert isinstance(discr, Discretization)
+
+    if tree_kind is not None:
+        setup_actx = lpot_source._setup_actx
+        assert isinstance(setup_actx, PyOpenCLArrayContext)
+
+        from pytential.qbx.utils import tree_code_container
+        tcc = tree_code_container(setup_actx)
+
+        from arraycontext import flatten
+        from meshmode.dof_array import DOFArray
+        tree, _ = tcc.build_tree()(actx.queue,
+                particles=flatten(
+                    actx.thaw(discr.nodes()), actx, leaf_class=DOFArray
+                    ),
+                max_particles_in_box=max_particles_in_box,
+                kind=tree_kind)
+        tree = tree.get(actx.queue)
+
+        # FIXME maybe this should use IS_LEAF once available?
+        from boxtree import box_flags_enum
+        assert tree.box_flags is not None
+        leaf_boxes, = (
+                tree.box_flags & box_flags_enum.HAS_SOURCE_OR_TARGET_CHILD_BOXES == 0
+                ).nonzero()
+
+        # one entry per leaf box, filled from ``tree.user_source_ids`` below
+        indices = np.empty(len(leaf_boxes), dtype=object)
+        starts = None
+
+        for i, ibox in enumerate(leaf_boxes):
+            box_start = tree.box_source_starts[ibox]
+            box_end = box_start + tree.box_source_counts_cumul[ibox]
+            indices[i] = tree.user_source_ids[box_start:box_end]
+
+        ctree = ClusterTree(
+            nlevels=tree.nlevels,
+            leaf_cluster_box_ids=leaf_boxes,
+            tree_cluster_parent_ids=tree.box_parent_ids,
+            _tree=tree)
+    else:  # NOTE(review): the check below only rejects curves in 3D; confirm
+        if discr.ambient_dim != 2 and discr.dim == 1:
+            raise ValueError("only curves are supported for 'tree_kind=None'")
+
+        nclusters = max(discr.ndofs // max_particles_in_box, 2)  # always >= 2
+        indices = np.arange(0, discr.ndofs, dtype=np.int64)
+        starts = np.linspace(0, discr.ndofs, nclusters + 1, dtype=np.int64)
+
+        # FIXME: mypy seems to be able to figure this out with numpy 2.0
+        assert starts is not None
+        assert starts[-1] == discr.ndofs
+
+        ctree = _build_binary_ish_tree_from_starts(starts)
+
+    from pytential.linalg import make_index_list
+    return make_index_list(indices, starts=starts), ctree
+
+# }}}
+
+
+# {{{ visualize clusters
+
+def visualize_clusters(actx: PyOpenCLArrayContext,
+                       generator: ProxyGenerator,
+                       srcindex: IndexList, tree: ClusterTree,
+                       filename: str | pathlib.Path, *,
+                       dofdesc: sym.DOFDescriptorLike | None = None,
+                       overwrite: bool = False) -> None:
+    """Write one visualization file per level of *tree* (2D and 3D only)."""
+    filename = pathlib.Path(filename)
+
+    places = generator.places
+    if dofdesc is None:
+        dofdesc = places.auto_source
+    dofdesc = sym.as_dofdesc(dofdesc)
+
+    discr = places.get_discretization(dofdesc.geometry, dofdesc.discr_stage)
+    assert isinstance(discr, Discretization)
+
+    if discr.ambient_dim == 2:
+        _visualize_clusters_2d(actx, generator, discr, srcindex, tree, filename,
+                               dofdesc=dofdesc, overwrite=overwrite)
+    elif discr.ambient_dim == 3:
+        _visualize_clusters_3d(actx, generator, discr, srcindex, tree, filename,
+                               dofdesc=dofdesc, overwrite=overwrite)
+    else:
+        raise NotImplementedError(f"Unsupported dimension: {discr.ambient_dim}")
+
+
+def _visualize_clusters_2d(actx: PyOpenCLArrayContext,
+                           generator: ProxyGenerator,
+                           discr: Discretization,
+                           srcindex: IndexList,
+                           tree: ClusterTree,
+                           filename: pathlib.Path, *,
+                           dofdesc: sym.DOFDescriptor,
+                           overwrite: bool = False) -> None:
+    """Plot the clusters and proxy balls on each level of *tree*."""
+    from itertools import cycle
+
+    import matplotlib.pyplot as pt
+
+    from arraycontext import flatten
+    from boxtree.visualization import TreePlotter
+    from meshmode.dof_array import DOFArray
+
+    assert discr.ambient_dim == 2
+    x, y = actx.to_numpy(flatten(discr.nodes(), actx, leaf_class=DOFArray))
+    for clevel in tree.levels:
+        outfile = filename.with_stem(f"{filename.stem}-lvl{clevel.level:03d}")
+        if not overwrite and outfile.exists():
+            raise FileExistsError(f"Output file '{outfile}' already exists")
+
+        pxy = generator(actx, dofdesc, srcindex).to_numpy(actx)
+        pxycenters = pxy.centers
+        pxyradii = pxy.radii
+        clsradii = pxy.cluster_radii
+
+        fig = pt.figure()
+        ax = fig.gca()
+
+        if tree._tree is not None:  # no box tree when ``tree_kind=None``
+            plotter = TreePlotter(tree._tree)
+            plotter.set_bounding_box()
+            plotter.draw_tree(fill=False, edgecolor="black", zorder=10)
+
+        ax.plot(x, y, "ko", ms=2.0)
+        for i in range(srcindex.nclusters):
+            isrc = srcindex.cluster_indices(i)
+            ax.plot(x[isrc], y[isrc], "o", ms=2.0)
+
+        colors = cycle(pt.rcParams["axes.prop_cycle"].by_key()["color"])
+        for ppm in clevel.parent_map:
+            color = next(colors)
+            for j in ppm:
+                center = (pxycenters[0, j], pxycenters[1, j])
+                for radius in (pxyradii[j], clsradii[j]):
+                    ax.add_artist(pt.Circle(center, radius, color=color, alpha=0.1))
+                ax.text(*center, f"{j}", fontsize=18)
+
+        ax.set_xlabel("$x$")
+        ax.set_ylabel("$y$")
+        ax.relim()
+        ax.autoscale()
+        ax.set_aspect("equal")
+
+        fig.savefig(outfile)
+        pt.close(fig)
+
+        srcindex = cluster(srcindex, clevel)
+
+
+def _visualize_clusters_3d(actx: PyOpenCLArrayContext,
+                           generator: ProxyGenerator,
+                           discr: Discretization,
+                           srcindex: IndexList,
+                           tree: ClusterTree,
+                           filename: pathlib.Path, *,
+                           dofdesc: sym.DOFDescriptor,
+                           overwrite: bool = False) -> None:
+    """Write a ``.vtu`` file with the clusters and proxy balls of each level."""
+    from arraycontext import unflatten
+    from meshmode.discretization.poly_element import (
+        InterpolatoryEdgeClusteredGroupFactory,
+    )
+    from meshmode.discretization.visualization import make_visualizer
+    from meshmode.mesh.generation import generate_sphere
+    from meshmode.mesh.processing import affine_map, merge_disjoint_meshes
+
+    # NOTE: This writes out one vtu file for each level that contains
+    #   * a mesh that's the union of `discr` and a sphere for each proxy ball
+    #   * marker: a marker on `discr` (NaN on the proxy balls) for each of the
+    #     clusters at the current level
+    #   * proxies: a marker on the proxy balls (NaN on `discr`)
+    #
+    # Not quite sure how to best visualize the whole geometry here, so the
+    # proposed workflow is to load the vtu file twice, set opacity to 0 for
+    # NaNs and set opacity to something small for the proxy balls.
+
+    # TODO: color proxy balls based on their parent so we can easily see how
+    # they will cluster
+
+    assert discr.ambient_dim == 3
+    # the reference sphere is the same on every level, so it is built once
+    ref_mesh = generate_sphere(1, 4, uniform_refinement_rounds=1)
+
+    for clevel in tree.levels:
+        outfile = filename.with_stem(f"{filename.stem}-lvl{clevel.level:03d}")
+        outfile = outfile.with_suffix(".vtu")
+        if not overwrite and outfile.exists():
+            raise FileExistsError(f"Output file '{outfile}' already exists")
+
+        # construct proxy balls
+        pxy = generator(actx, dofdesc, srcindex).to_numpy(actx)
+        pxycenters = pxy.centers
+        pxyradii = pxy.radii
+
+        # construct meshes for each proxy ball
+        pxymeshes = [
+            affine_map(ref_mesh, A=pxyradii[i], b=pxycenters[:, i].squeeze())
+            for i in range(srcindex.nclusters)
+        ]
+
+        # merge meshes into a single discretization
+        pxymesh = merge_disjoint_meshes([discr.mesh, *pxymeshes])
+        pxydiscr = Discretization(actx, pxymesh,
+                                  InterpolatoryEdgeClusteredGroupFactory(4))
+
+        # add a marker field for all clusters
+        marker = np.full((pxydiscr.ndofs,), np.nan, dtype=np.float64)
+        template_ary = actx.thaw(pxydiscr.nodes()[0])
+
+        for i in range(srcindex.nclusters):
+            isrc = srcindex.cluster_indices(i)
+            marker[isrc] = 10.0 * (i + 1.0)
+        marker_dev = unflatten(template_ary, actx.from_numpy(marker), actx)
+
+        # add a marker field for all proxies
+        pxymarker = np.full((pxydiscr.ndofs,), np.nan, dtype=np.float64)
+        pxymarker[discr.ndofs:] = 1.0
+        pxymarker_dev = unflatten(template_ary, actx.from_numpy(pxymarker), actx)
+
+        # write it all out
+        vis = make_visualizer(actx, pxydiscr)
+        vis.write_vtk_file(str(outfile), [
+            ("marker", marker_dev),
+            ("proxies", pxymarker_dev),
+            ], overwrite=overwrite)
+
+        srcindex = cluster(srcindex, clevel)
+
+
+# }}}
+
+
+# vim: foldmethod=marker
diff --git a/pytential/linalg/direct_solver_symbolic.py b/pytential/linalg/direct_solver_symbolic.py
index 290f4048c..eb13a4946 100644
--- a/pytential/linalg/direct_solver_symbolic.py
+++ b/pytential/linalg/direct_solver_symbolic.py
@@ -76,12 +76,14 @@ def prepare_proxy_expr(
         places: GeometryCollection,
         exprs: Iterable[ArithmeticExpression],
         auto_where: tuple[DOFDescriptorLike, DOFDescriptorLike],
+        remove_transforms: bool = True,
     ) -> obj_array.ObjectArray1D[ArithmeticExpression]:
     def _prepare_expr(expr: ArithmeticExpression) -> ArithmeticExpression:
         # remove all diagonal / non-operator terms in the expression
         expr = IntGTermCollector()(expr)
         # ensure all IntGs remove all the kernel derivatives
-        expr = KernelTransformationRemover()(expr)
+        if remove_transforms:
+            expr = KernelTransformationRemover()(expr)
         # ensure all IntGs have their source and targets set
         expr = DOFDescriptorReplacer(
                                      default_source=auto_where[0],
diff --git a/pytential/linalg/hmatrix.py b/pytential/linalg/hmatrix.py
new file mode 100644
index 000000000..7bc97012b
--- /dev/null
+++ b/pytential/linalg/hmatrix.py
@@ -0,0 +1,590 @@
+from __future__ import annotations
+
+
+__copyright__ = "Copyright (C) 2022 Alexandru Fikl"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import logging
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any
+
+import numpy as np
+import numpy.linalg as la
+from scipy.sparse.linalg import LinearOperator
+
+from arraycontext import ArrayOrContainerT, PyOpenCLArrayContext, flatten, unflatten
+from meshmode.dof_array import DOFArray
+from pytools import ProcessLogger, log_process, obj_array
+
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+
+    from numpy.typing import NDArray
+
+    from pytential import GeometryCollection, sym
+    from pytential.linalg.cluster import ClusterLevel, ClusterTree
+    from pytential.linalg.proxy import ProxyGeneratorBase
+    from pytential.linalg.skeletonization import (
+        SkeletonizationResult,
+        SkeletonizationWrangler,
+    )
+    from pytential.linalg.utils import IndexList, TargetAndSourceClusterList
+
+logger = logging.getLogger(__name__)
+
+
+__doc__ = """
+Hierarchical Matrix Construction
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: ProxyHierarchicalMatrixWrangler
+.. autoclass:: ProxyHierarchicalMatrix
+.. autoclass:: ProxyHierarchicalForwardMatrix
+.. autoclass:: ProxyHierarchicalBackwardMatrix
+
+.. autofunction:: build_hmatrix_by_proxy
+"""
+
+
+# {{{ error model
+
+def hmatrix_error_from_param(
+        ambient_dim: int,
+        *,
+        id_eps: float,
+        id_rank: int,
+        min_proxy_radius: float,
+        max_cluster_radius: float,
+        nproxies: int,
+        nsources: int,
+        ntargets: int,
+        c: float = 1.0e-3) -> float:
+    import math
+
+    # FIXME: This is horribly out of date right now. Need to get the updated version
+    # from https://github.com/alexfikl/qbx-ds-paper-experiments
+    if ambient_dim == 2:
+        p = int(0.5 * id_rank)
+    elif ambient_dim == 3:
+        p = int((math.sqrt(1 + 4 * id_rank) - 1) / 2)
+    else:
+        raise ValueError(f"unsupported ambient dimension: '{ambient_dim}'")
+
+    rho = alpha = max_cluster_radius / min_proxy_radius
+    return float(
+        c * rho ** (p + 1) / (1 - rho)
+        + math.sqrt(nsources / nproxies)
+        * (1 - alpha ** (p + 1)) / (1 - alpha) * id_eps
+        )
+
+# }}}
+
+
+# {{{ update diagonals
+
+def _update_skeleton_diagonal(
+        skeleton: SkeletonizationResult,
+        parent: SkeletonizationResult | None,
+        clevel: ClusterLevel | None,
+        diagonal: NDArray[np.inexact] | None = None) -> SkeletonizationResult:
+    """Due to the evaluation in :func:`_skeletonize_block_by_proxy_with_mats`,
+    the diagonal matrix in *skeleton* also contains the indices from its
+    parent. In particular, at a level :math:`l` we need the diagonal block::
+
+        0               D_{i, j + 1}        D_{i, j + 2}
+        D_{i + 1, j}    0                   D_{i + 1, j + 2}
+        D_{i + 2, j}    D_{i + 2, j + 1}    0
+
+    but the version in *skeleton* also fills in the 0 blocks in there. This
+    routine goes through them and zeros them out.
+    """
+
+    if clevel is None:
+        return skeleton
+
+    assert parent is not None
+    assert skeleton.tgt_src_index.shape == parent.skel_tgt_src_index.shape
+
+    if diagonal is None:
+        diagonal = np.zeros(parent.nclusters)
+
+    from numbers import Number
+    if isinstance(diagonal, Number):
+        diagonal = np.full(parent.nclusters, diagonal, dtype=skeleton.dtype)
+
+    assert diagonal.size == parent.nclusters
+    targets, sources = parent.skel_tgt_src_index
+
+    # FIXME: nicer way to do this?
+    mat = np.empty(skeleton.nclusters, dtype=object)
+    for k in range(skeleton.nclusters):
+        D = skeleton.D[k].copy()
+
+        i = j = 0
+        for icluster in clevel.parent_map[k]:
+            di = targets.cluster_size(icluster)
+            dj = sources.cluster_size(icluster)
+            D[np.s_[i:i + di], np.s_[j:j + dj]] = diagonal[icluster]
+
+            i += di
+            j += dj
+
+        assert D.shape == (i, j)
+        mat[k] = D
+
+    from dataclasses import replace
+    return replace(skeleton, D=mat)
+
+
+@log_process(logger)
+def _update_skeletons_diagonal(
+        wrangler: ProxyHierarchicalMatrixWrangler,
+        forward: bool = True,
+        ) -> NDArray[np.inexact]:
+    skeletons = np.empty(wrangler.skeletons.shape, dtype=object)
+    skeletons[0] = wrangler.skeletons[0]
+
+    for i in range(1, wrangler.ctree.nlevels):
+        diagonal = None if forward else skeletons[i - 1].Dhat
+
+        skeletons[i] = _update_skeleton_diagonal(
+            wrangler.skeletons[i],
+            wrangler.skeletons[i - 1],
+            wrangler.ctree.levels[i - 1],
+            diagonal=diagonal)
+
+    return skeletons
+
+# }}}
+
+
+# {{{ ProxyHierarchicalMatrix
+
+@dataclass(frozen=True)
+class ProxyHierarchicalMatrixWrangler:
+    """
+    .. automethod:: get_forward
+    .. automethod:: get_backward
+    """
+
+    wrangler: SkeletonizationWrangler
+    proxy: ProxyGeneratorBase
+    ctree: ClusterTree
+    skeletons: obj_array.ObjectArray1D[SkeletonizationResult]
+
+    @property
+    def tgt_src_index(self) -> TargetAndSourceClusterList:
+        return self.skeletons[0].tgt_src_index
+
+    def get_forward(self) -> ProxyHierarchicalForwardMatrix:
+        return ProxyHierarchicalForwardMatrix(
+            ctree=self.ctree,
+            skeletons=_update_skeletons_diagonal(self, forward=True),
+            )
+
+    def get_backward(self) -> ProxyHierarchicalBackwardMatrix:
+        return ProxyHierarchicalBackwardMatrix(
+            ctree=self.ctree,
+            skeletons=_update_skeletons_diagonal(self, forward=False))
+
+
+@dataclass(frozen=True)
+class ProxyHierarchicalMatrix(LinearOperator):
+    """
+    .. autoattribute:: ctree
+    .. autoattribute:: skeletons
+
+    This class implements the :class:`scipy.sparse.linalg.LinearOperator`
+    interface. In particular, the following attributes and methods:
+
+    .. autoproperty:: shape
+    .. autoproperty:: dtype
+
+    .. automethod:: matvec
+    .. automethod:: __matmul__
+    """
+
+    ctree: ClusterTree
+    """A tree structure that describes the hierarchy of the solver."""
+    skeletons: obj_array.ObjectArray1D[SkeletonizationResult]
+    """An :class:`~numpy.ndarray` containing skeletonization information
+    for each level of the hierarchy. For additional details, see
+    :class:`~pytential.linalg.skeletonization.SkeletonizationResult`.
+    """
+
+    @property
+    def shape(self) -> tuple[int, int]:
+        """A :class:`tuple` that gives the size of the skeletonized operator."""
+        return self.skeletons[0].tgt_src_index.shape
+
+    @property
+    def dtype(self) -> np.dtype[np.inexact]:
+        """The :class:`numpy.dtype` of the skeletonized operator."""
+        # FIXME: assert that everyone has this dtype?
+        return self.skeletons[0].R[0].dtype
+
+    @property
+    def nlevels(self) -> int:
+        return self.skeletons.size
+
+    @property
+    def nclusters(self) -> int:
+        return self.skeletons[0].nclusters
+
+    def __matmul__(self, x: ArrayOrContainerT) -> ArrayOrContainerT:
+        """Same as :meth:`_matvec`."""
+        return self._matvec(x)
+
+    def _matmat(self, mat):
+        raise NotImplementedError
+
+    def _adjoint(self, x):
+        raise NotImplementedError
+
+# }}}
+
+
+# {{{ forward
+
+@dataclass(frozen=True)
+class ProxyHierarchicalForwardMatrix(ProxyHierarchicalMatrix):
+    def _matvec(self, x: ArrayOrContainerT) -> ArrayOrContainerT:
+        """Apply the operator to a DOFArray or a non-object ndarray."""
+        actx = None     # NOTE: must also be bound on the plain ndarray path
+        if isinstance(x, DOFArray):
+            from arraycontext import get_container_context_recursively_opt
+            actx = get_container_context_recursively_opt(x)
+            if actx is None:
+                raise ValueError("input array is frozen")
+            ary = actx.to_numpy(flatten(x, actx))
+        elif isinstance(x, np.ndarray) and x.dtype.char != "O":
+            ary = x
+        else:
+            raise TypeError(f"unsupported input type: {type(x)}")
+
+        assert actx is None or isinstance(actx, PyOpenCLArrayContext)
+        result = apply_skeleton_forward_matvec(self, ary)
+        if isinstance(x, DOFArray):
+            assert actx is not None
+            result = unflatten(x, actx.from_numpy(result), actx)
+
+        return result
+
+
+@log_process(logger)
+def apply_skeleton_forward_matvec(
+        hmat: ProxyHierarchicalMatrix,
+        ary: ArrayOrContainerT,
+        ) -> ArrayOrContainerT:
+    from pytential.linalg.cluster import split_array
+    targets, sources = hmat.skeletons[0].tgt_src_index
+    x = split_array(ary, sources)   # type: ignore[arg-type]
+
+    # NOTE: this computes a telescoping product of the form
+    #
+    #   A x_0 = (D0 + L0 (D1 + L1 (...) R1) R0) x_0
+    #
+    # with arbitrary numbers of levels. When recursing down, we compute
+    #
+    #   x_{k + 1} = R_k x_k
+    #   z_{k + 1} = D_k x_k
+    #
+    # and, at the root level, we have
+    #
+    #   x_{N + 1} = z_{N + 1} = D_N x_N.
+    #
+    # When recursing back up, we take `b_{N + 1} = x_{N + 1}` and
+    #
+    #   b_{k - 1} = z_k + L_k b_k
+    #
+    # which gives back the desired product when we reach the leaf level again.
+
+    d_dot_x = np.empty(hmat.nlevels, dtype=object)
+
+    # {{{ recurse down
+
+    from pytential.linalg.cluster import cluster
+
+    with ProcessLogger(logger, "apply_skeleton_forward_matvec (compress)"):
+        for k, clevel in enumerate(hmat.ctree.levels):
+            skeleton = hmat.skeletons[k]
+            assert x.shape == (skeleton.nclusters,)
+            assert skeleton.tgt_src_index.shape[1] == sum(xi.size for xi in x)
+
+            d_dot_x_k = np.empty(skeleton.nclusters, dtype=object)
+            r_dot_x_k = np.empty(skeleton.nclusters, dtype=object)
+
+            for i in range(skeleton.nclusters):
+                r_dot_x_k[i] = skeleton.R[i] @ x[i]
+                d_dot_x_k[i] = skeleton.D[i] @ x[i]
+
+            d_dot_x[k] = d_dot_x_k
+            x = cluster(r_dot_x_k, clevel)
+
+    # }}}
+
+    # {{{ root
+
+    # NOTE: at root level, we just multiply with the full diagonal
+    b = d_dot_x[hmat.nlevels - 1]
+    assert b.shape == (1,)
+
+    # }}}
+
+    # {{{ recurse up
+
+    from pytential.linalg.cluster import uncluster
+
+    with ProcessLogger(logger, "apply_skeleton_forward_matvec (inflate)"):
+        for k, clevel in reversed(list(enumerate(hmat.ctree.levels[:-1]))):
+            skeleton = hmat.skeletons[k]
+            d_dot_x_k = d_dot_x[k]
+            assert d_dot_x_k.shape == (skeleton.nclusters,)
+
+            b = uncluster(b, skeleton.skel_tgt_src_index.targets, clevel)
+            for i in range(skeleton.nclusters):
+                b[i] = d_dot_x_k[i] + skeleton.L[i] @ b[i]
+
+    assert b.shape == (hmat.nclusters,)
+
+    # }}}
+
+    return np.concatenate(b)[np.argsort(targets.indices)]
+
+# }}}
+
+
+# {{{ backward
+
+@dataclass(frozen=True)
+class ProxyHierarchicalBackwardMatrix(ProxyHierarchicalMatrix):
+    def _matvec(self, x: ArrayOrContainerT) -> ArrayOrContainerT:
+        """Apply the backward pass to a DOFArray or a non-object ndarray."""
+        actx = None     # NOTE: must also be bound on the plain ndarray path
+        if isinstance(x, DOFArray):
+            from arraycontext import get_container_context_recursively_opt
+            actx = get_container_context_recursively_opt(x)
+            if actx is None:
+                raise ValueError("input array is frozen")
+            ary = actx.to_numpy(flatten(x, actx))
+        elif isinstance(x, np.ndarray) and x.dtype.char != "O":
+            ary = x
+        else:
+            raise TypeError(f"unsupported input type: {type(x)}")
+
+        assert actx is None or isinstance(actx, PyOpenCLArrayContext)
+        result = apply_skeleton_backward_matvec(actx, self, ary)
+        if isinstance(x, DOFArray):
+            assert actx is not None
+            result = unflatten(x, actx.from_numpy(result), actx)
+
+        return result
+
+
+@log_process(logger)
+def apply_skeleton_backward_matvec(
+        actx: PyOpenCLArrayContext | None,
+        hmat: ProxyHierarchicalMatrix,
+        ary: ArrayOrContainerT,
+        ) -> ArrayOrContainerT:
+    from pytential.linalg.cluster import split_array
+    targets, sources = hmat.skeletons[0].tgt_src_index
+
+    b = split_array(ary, targets)   # type: ignore[arg-type]
+    r_dot_b = np.empty(hmat.nlevels, dtype=object)
+
+    # {{{ recurse down
+
+    # NOTE: this solves a telescoping product of the form
+    #
+    #   A x_0 = (D0 + L0 (D1 + L1 (...) R1) R0) x_0 = b_0
+    #
+    # with arbitrary numbers of levels. When recursing down, we compute
+    #
+    #   b_{k + 1} = \hat{D}_k R_k D_k^{-1} b_k
+    #   \hat{D}_k = (R_k D_k^{-1} L_k)^{-1}
+    #
+    # and, at the root level, we solve
+    #
+    #   D_N x_N = b_N.
+    #
+    # When recursing back up, we take `b_{N + 1} = x_{N + 1}` and
+    #
+    #   x_{k} = D_k^{-1} (b_k - L_k b_{k + 1} + L_k \hat{D}_k x_{k + 1})
+    #
+    # which gives back the desired solution when we reach the leaf level again.
+
+    from pytential.linalg.cluster import cluster
+
+    with ProcessLogger(logger, "apply_skeleton_backward_matvec (compress)"):
+        for k, clevel in enumerate(hmat.ctree.levels):
+            skeleton = hmat.skeletons[k]
+            assert b.shape == (skeleton.nclusters,)
+            assert skeleton.tgt_src_index.shape[0] == sum(bi.size for bi in b)
+
+            dhat_dot_b_k = np.empty(skeleton.nclusters, dtype=object)
+            for i in range(skeleton.nclusters):
+                dhat_dot_b_k[i] = (
+                    skeleton.Dhat[i] @ (skeleton.R[i] @ (skeleton.invD[i] @ b[i]))
+                    )
+
+            r_dot_b[k] = b
+            b = cluster(dhat_dot_b_k, clevel)
+
+    # }}}
+
+    # {{{ root
+
+    assert b.shape == (1,)
+
+    with ProcessLogger(logger,
+                       f"apply_skeleton_backward_matvec (root solve: {b[0].size}): "):
+        x = obj_array.new_1d([
+            la.solve(D, bi) for D, bi in zip(hmat.skeletons[-1].D, b, strict=True)
+            ])
+
+    # }}}
+
+    # {{{ recurse up
+
+    from pytential.linalg.cluster import uncluster
+
+    with ProcessLogger(logger, "apply_skeleton_backward_matvec (inflate)"):
+        for k, clevel in reversed(list(enumerate(hmat.ctree.levels[:-1]))):
+            skeleton = hmat.skeletons[k]
+            b0 = r_dot_b[k]
+            b1 = r_dot_b[k + 1]
+            assert b0.shape == (skeleton.nclusters,)
+
+            x = uncluster(x, skeleton.skel_tgt_src_index.sources, clevel)
+            b1 = uncluster(b1, skeleton.skel_tgt_src_index.targets, clevel)
+
+            for i in range(skeleton.nclusters):
+                sx = b1[i] - skeleton.Dhat[i] @ x[i]
+                x[i] = skeleton.invD[i] @ (b0[i] - skeleton.L[i] @ sx)
+
+    assert x.shape == (hmat.nclusters,)
+
+    # }}}
+
+    return np.concatenate(x)[np.argsort(sources.indices)]
+
+# }}}
+
+
+# {{{ build_hmatrix_by_proxy
+
+def build_hmatrix_by_proxy(
+        actx: PyOpenCLArrayContext,
+        places: GeometryCollection,
+        exprs: sym.Expression | Sequence[sym.Expression],
+        input_exprs: sym.Variable | Sequence[sym.Variable], *,
+        auto_where: sym.DOFDescriptorLike | None = None,
+        domains: Sequence[sym.DOFDescriptorLike] | None = None,
+        context: dict[str, Any] | None = None,
+        id_eps: float = 1.0e-8,
+        rng: np.random.Generator | None = None,
+
+        # NOTE: these are dev variables and can disappear at any time!
+        _tree_kind: str | None = "adaptive-level-restricted",
+        _weighted_proxy: bool | tuple[bool, bool] | None = None,
+
+        # TODO: plug in an error model to get an estimate for:
+        #   * how many points we want per cluster?
+        #   * how many proxy points we want?
+        #   * how far away should the proxy points be?
+        # based on id_eps. How many of these should be user tunable?
+        _max_particles_in_box: int | None = None,
+        _approx_nproxy: int | None = None,
+        _proxy_radius_factor: float | None = None,
+        ) -> ProxyHierarchicalMatrixWrangler:
+    from pytential.linalg.skeletonization import make_skeletonization_wrangler
+    from pytential.symbolic.matrix import P2PClusterMatrixBuilder
+
+    def P2PClusterMatrixBuilderWithDiagonal(*args, **kwargs):
+        kwargs["exclude_self"] = True
+        return P2PClusterMatrixBuilder(*args, **kwargs)
+
+    wrangler = make_skeletonization_wrangler(
+            places, exprs, input_exprs,
+            domains=domains, context=context, auto_where=auto_where,
+            _weighted_proxy=_weighted_proxy,
+            # _remove_source_transforms=True,
+            # _neighbor_cluster_builder=P2PClusterMatrixBuilderWithDiagonal,
+            # _proxy_source_cluster_builder=P2PClusterMatrixBuilder,
+            # _proxy_target_cluster_builder=P2PClusterMatrixBuilder,
+            )
+
+    if wrangler.nrows != 1 or wrangler.ncols != 1:
+        raise ValueError("multi-block operators are not supported")
+
+    from pytential.linalg.proxy import QBXProxyGenerator
+    proxy = QBXProxyGenerator(places,
+            approx_nproxy=_approx_nproxy,
+            radius_factor=_proxy_radius_factor)
+
+    from pytential.linalg.cluster import partition_by_nodes
+    cluster_index, ctree = partition_by_nodes(
+        actx, places,
+        dofdesc=wrangler.domains[0],
+        tree_kind=_tree_kind,
+        max_particles_in_box=_max_particles_in_box)
+
+    logger.info("tree levels: %d", ctree.nlevels)
+    logger.info("cluster count: %d", cluster_index.nclusters)
+    logger.info("leaf cluster sizes: %s", [
+        # NOTE: making into a list so that they all get printed
+        int(s) for s in np.diff(cluster_index.starts)
+        ])
+
+    from pytential.linalg.utils import TargetAndSourceClusterList
+    tgt_src_index = TargetAndSourceClusterList(
+        targets=cluster_index, sources=cluster_index)
+
+    from pytential.linalg.skeletonization import rec_skeletonize_by_proxy
+    skeletons = rec_skeletonize_by_proxy(
+        actx, places, ctree, tgt_src_index, exprs, input_exprs,
+        id_eps=id_eps,
+        rng=rng,
+        max_particles_in_box=_max_particles_in_box,
+        _proxy=proxy,
+        _wrangler=wrangler,
+        )
+
+    if __debug__:
+        def _get_cluster_avg_size(idx: IndexList) -> str:
+            d = np.diff(idx.starts)
+            return f"{np.mean(d):.2f} ± {np.std(d):.2f}"
+
+        logger.info("avg cluster size: %s", " ".join(
+            _get_cluster_avg_size(sk.tgt_src_index.sources)
+            for sk in skeletons
+            ))
+
+    return ProxyHierarchicalMatrixWrangler(
+        wrangler=wrangler, proxy=proxy, ctree=ctree, skeletons=skeletons
+        )
+
+# }}}
+
+# vim: foldmethod=marker
diff --git a/pytential/linalg/proxy.py b/pytential/linalg/proxy.py
index 964698a8e..338d971c8 100644
--- a/pytential/linalg/proxy.py
+++ b/pytential/linalg/proxy.py
@@ -36,7 +36,7 @@
 from arraycontext import Array, ArrayContainer, PyOpenCLArrayContext, flatten
 from meshmode.discretization import Discretization
 from meshmode.dof_array import DOFArray
-from pytools import memoize_in
+from pytools import log_process, memoize_in
 
 from pytential import GeometryCollection, bind, sym
 from pytential.qbx import QBXLayerPotentialSource
@@ -49,7 +49,6 @@
 
     import optype.numpy as onp
 
-    from boxtree.tree_build import TreeKind
     from sumpy.expansion import ExpansionBase
     from sumpy.kernel import Kernel
 
@@ -59,7 +58,6 @@
 
 logger = logging.getLogger(__name__)
 
-
 __doc__ = """
 Proxy Point Generation
 ~~~~~~~~~~~~~~~~~~~~~~
@@ -74,7 +72,6 @@
 .. autoclass:: QBXProxyGenerator
     :show-inheritance:
 
-.. autofunction:: partition_by_nodes
 .. autofunction:: gather_cluster_neighbor_points
 """
 
@@ -82,82 +79,6 @@
 _DEFAULT_MAX_PARTICLES_IN_BOX = 32
 
 
-# {{{ point index partitioning
-
-def partition_by_nodes(
-        actx: PyOpenCLArrayContext,
-        places: GeometryCollection, *,
-        dofdesc: DOFDescriptorLike | None = None,
-        tree_kind: TreeKind | None = "adaptive-level-restricted",
-        max_particles_in_box: int | None = None) -> IndexList:
-    """Generate equally sized ranges of nodes. The partition is created at the
-    lowest level of granularity, i.e. nodes. This results in balanced ranges
-    of points, but will split elements across different ranges.
-
-    :arg dofdesc: a :class:`~pytential.symbolic.dof_desc.DOFDescriptor` for
-        the geometry in *places* which should be partitioned.
-    :arg tree_kind: if not *None*, it is passed to :class:`boxtree.TreeBuilder`.
-    :arg max_particles_in_box: value used to control the number of points
-        in each partition (and thus the number of partitions). See the documentation
-        in :class:`boxtree.TreeBuilder`.
-    """
-    if dofdesc is None:
-        dofdesc = places.auto_source
-    dofdesc = sym.as_dofdesc(dofdesc)
-
-    if max_particles_in_box is None:
-        max_particles_in_box = _DEFAULT_MAX_PARTICLES_IN_BOX
-
-    from pytential.source import LayerPotentialSourceBase
-
-    lpot_source = places.get_geometry(dofdesc.geometry)
-    assert isinstance(lpot_source, LayerPotentialSourceBase)
-
-    discr = places.get_discretization(dofdesc.geometry, dofdesc.discr_stage)
-    assert isinstance(discr, Discretization)
-
-    if tree_kind is not None:
-        from pytential.qbx.utils import tree_code_container
-        tcc = tree_code_container(lpot_source._setup_actx)
-
-        tree, _ = tcc.build_tree()(actx.queue,
-                particles=flatten(
-                    actx.thaw(discr.nodes()), actx, leaf_class=DOFArray
-                    ),
-                max_particles_in_box=max_particles_in_box,
-                kind=tree_kind)
-
-        from boxtree import box_flags_enum
-        tree = tree.get(actx.queue)
-        # FIXME: maybe this should use IS_LEAF once available?
-        assert tree.box_flags is not None
-        leaf_boxes, = (
-                tree.box_flags & box_flags_enum.HAS_SOURCE_OR_TARGET_CHILD_BOXES == 0
-                ).nonzero()
-
-        indices = np.empty(len(leaf_boxes), dtype=object)
-        starts: onp.Array1D[np.integer] | None = None
-
-        for i, ibox in enumerate(leaf_boxes):
-            box_start = tree.box_source_starts[ibox]
-            box_end = box_start + tree.box_source_counts_cumul[ibox]
-            indices[i] = tree.user_source_ids[box_start:box_end]
-    else:
-        if discr.ambient_dim != 2 and discr.dim == 1:
-            raise ValueError("only curves are supported for 'tree_kind=None'")
-
-        nclusters = max(discr.ndofs // max_particles_in_box, 2)
-        indices = np.arange(0, discr.ndofs, dtype=np.int64)
-        starts = np.linspace(0, discr.ndofs, nclusters + 1, dtype=np.int64)
-
-        assert starts[-1] == discr.ndofs
-
-    from pytential.linalg.utils import make_index_list
-    return make_index_list(indices, starts=starts)
-
-# }}}
-
-
 # {{{ proxy points
 
 class ProxyPointSource(PointPotentialSource):
@@ -231,6 +152,7 @@ class ProxyClusterGeometryData:
     """
 
     places: GeometryCollection
+    """Geometry collection containing the used :attr:`dofdesc`."""
     dofdesc: sym.DOFDescriptor
     """A descriptor for the geometry used to compute the proxy points."""
 
@@ -469,6 +391,7 @@ def get_centers_kernel_ex(self, actx: PyOpenCLArrayContext) -> lp.ExecutorBase:
     def get_radii_kernel_ex(self, actx: PyOpenCLArrayContext) -> lp.ExecutorBase:
         pass
 
+    @log_process(logger)
     def __call__(self,
             actx: PyOpenCLArrayContext,
             source_dd: DOFDescriptorLike | None,
@@ -655,6 +578,7 @@ def get_radii_kernel_ex(self, actx: PyOpenCLArrayContext) -> lp.ExecutorBase:
         return make_compute_cluster_qbx_radii_kernel_ex(actx, self.ambient_dim)
 
     @override
+    @log_process(logger)
     def __call__(self,
             actx: PyOpenCLArrayContext,
             source_dd: DOFDescriptorLike | None,
@@ -688,6 +612,7 @@ def __call__(self,
 
 # {{{ gather_cluster_neighbor_points
 
+@log_process(logger)
 def gather_cluster_neighbor_points(
         actx: PyOpenCLArrayContext,
         pxy: ProxyClusterGeometryData,
diff --git a/pytential/linalg/skeletonization.py b/pytential/linalg/skeletonization.py
index 0f14fe5dc..8feade486 100644
--- a/pytential/linalg/skeletonization.py
+++ b/pytential/linalg/skeletonization.py
@@ -23,15 +23,18 @@
 THE SOFTWARE.
 """
 
+import logging
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any
 
 import numpy as np
+import numpy.linalg as la
 
 from meshmode.discretization import Discretization
-from pytools import memoize_in, obj_array
+from pytools import log_process, memoize_in, memoize_method, obj_array
 
 from pytential import GeometryCollection, bind, sym
+from pytential.linalg.cluster import ClusterTree, cluster
 from pytential.linalg.direct_solver_symbolic import (
     PROXY_SKELETONIZATION_SOURCE,
     PROXY_SKELETONIZATION_TARGET,
@@ -52,16 +55,23 @@
     from pytential.linalg.proxy import ProxyClusterGeometryData, ProxyGeneratorBase
     from pytential.symbolic.matrix import ClusterMatrixBuilderBase
 
+logger = logging.getLogger(__name__)
+
+
+logger = logging.getLogger(__name__)
+
+logger = logging.getLogger(__name__)
 
 __doc__ = """
 Skeletonization
----------------
+~~~~~~~~~~~~~~~
 
 .. autoclass:: SkeletonizationWrangler
 .. autoclass:: make_skeletonization_wrangler
 
 .. autoclass:: SkeletonizationResult
 .. autofunction:: skeletonize_by_proxy
+.. autofunction:: rec_skeletonize_by_proxy
 """
 
 
@@ -143,7 +153,9 @@ def prg():
             """
             <> ioffset = starts[icluster]
             <> npoints = starts[icluster + 1] - ioffset
-            result[icluster] = reduce(sum, i, waa[indices[i + ioffset]]) / npoints
+            result[icluster] = (
+                reduce(sum, i, abs(waa[indices[i + ioffset]])) / npoints
+                if npoints > 0 else 1.0)
             """,
             lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
             )
@@ -307,9 +319,23 @@ def _evaluate_expr(
                 context=self.context,
                 **kwargs)(expr)
 
+    def evaluate_self(
+            self,
+            actx: PyOpenCLArrayContext,
+            places: GeometryCollection,
+            tgt_src_index: TargetAndSourceClusterList,
+            ibrow: int, ibcol: int,
+            ) -> onp.Array1D[Any]:
+        cls = self.neighbor_cluster_builder
+        return self._evaluate_expr(
+            actx, places, cls, tgt_src_index, self.exprs[ibrow],
+            idomain=ibcol, _weighted=True)
+
     # {{{ nearfield
 
-    def evaluate_source_neighbor_interaction(self,
+    @log_process(logger)
+    def evaluate_source_neighbor_interaction(
+            self,
             actx: PyOpenCLArrayContext,
             places: GeometryCollection,
             pxy: ProxyClusterGeometryData,
@@ -322,11 +348,13 @@ def evaluate_source_neighbor_interaction(self,
         expr = self.exprs[ibrow]
         mat = self._evaluate_expr(
                 actx, places, eval_mapper_cls, nbr_src_index, expr,
-                idomain=ibcol, _weighted=self.weighted_sources)
+                idomain=ibcol, _weighted=True)
 
         return mat, nbr_src_index
 
-    def evaluate_target_neighbor_interaction(self,
+    @log_process(logger)
+    def evaluate_target_neighbor_interaction(
+            self,
             actx: PyOpenCLArrayContext,
             places: GeometryCollection,
             pxy: ProxyClusterGeometryData,
@@ -339,7 +367,7 @@ def evaluate_target_neighbor_interaction(self,
         expr = self.exprs[ibrow]
         mat = self._evaluate_expr(
                 actx, places, eval_mapper_cls, tgt_nbr_index, expr,
-                idomain=ibcol, _weighted=self.weighted_targets)
+                idomain=ibcol, _weighted=True)
 
         return mat, tgt_nbr_index
 
@@ -347,7 +375,9 @@ def evaluate_target_neighbor_interaction(self,
 
     # {{{ proxy
 
-    def evaluate_source_proxy_interaction(self,
+    @log_process(logger)
+    def evaluate_source_proxy_interaction(
+            self,
             actx: PyOpenCLArrayContext,
             places: GeometryCollection,
             pxy: ProxyClusterGeometryData,
@@ -356,10 +386,15 @@ def evaluate_source_proxy_interaction(self,
         ) -> tuple[onp.Array1D[np.inexact], TargetAndSourceClusterList]:
         from pytential.collection import add_geometry_to_collection
         pxy_src_index = TargetAndSourceClusterList(pxy.pxyindex, pxy.srcindex)
+
         places = add_geometry_to_collection(
                 places, {PROXY_SKELETONIZATION_TARGET: pxy.as_targets()}
                 )
 
+        if not self.weighted_sources:
+            logger.warning("Source-Proxy weighting is turned off. This will not give "
+                           "good results for skeletonization.", stacklevel=3)
+
         eval_mapper_cls = self.proxy_source_cluster_builder
         expr = self.source_proxy_exprs[ibrow]
         mat = self._evaluate_expr(
@@ -370,7 +405,9 @@ def evaluate_source_proxy_interaction(self,
 
         return mat, pxy_src_index
 
-    def evaluate_target_proxy_interaction(self,
+    @log_process(logger)
+    def evaluate_target_proxy_interaction(
+            self,
             actx: PyOpenCLArrayContext,
             places: GeometryCollection,
             pxy: ProxyClusterGeometryData, nbrindex: IndexList, *,
@@ -378,6 +415,7 @@ def evaluate_target_proxy_interaction(self,
         ) -> tuple[onp.Array1D[np.inexact], TargetAndSourceClusterList]:
         from pytential.collection import add_geometry_to_collection
         tgt_pxy_index = TargetAndSourceClusterList(pxy.srcindex, pxy.pxyindex)
+
         places = add_geometry_to_collection(
                 places, {PROXY_SKELETONIZATION_SOURCE: pxy.as_sources()}
                 )
@@ -394,6 +432,9 @@ def evaluate_target_proxy_interaction(self,
             mat = _apply_weights(
                     actx, mat, places,
                     tgt_pxy_index, nbrindex, self.domains[ibcol])
+        else:
+            logger.warning("Target-Proxy weighting is turned off. This will not give "
+                           "good results for skeletonization.", stacklevel=3)
 
         return mat, tgt_pxy_index
 
@@ -410,6 +451,7 @@ def make_skeletonization_wrangler(
 
         # internal
         _weighted_proxy: bool | tuple[bool, bool] | None = None,
+        _remove_source_transforms: bool = False,
         _proxy_source_cluster_builder: type[ClusterMatrixBuilderBase] | None = None,
         _proxy_target_cluster_builder: type[ClusterMatrixBuilderBase] | None = None,
         _neighbor_cluster_builder: type[ClusterMatrixBuilderBase] | None = None,
@@ -438,9 +480,13 @@ def make_skeletonization_wrangler(
 
     prepared_lpot_exprs = prepare_expr(places, lpot_exprs, auto_where)
     source_proxy_exprs = prepare_proxy_expr(
-            places, prepared_lpot_exprs, (auto_where[0], PROXY_SKELETONIZATION_TARGET))
+            places, prepared_lpot_exprs, (auto_where[0], PROXY_SKELETONIZATION_TARGET),
+            remove_transforms=_remove_source_transforms)
     target_proxy_exprs = prepare_proxy_expr(
-            places, prepared_lpot_exprs, (PROXY_SKELETONIZATION_SOURCE, auto_where[1]))
+            places, prepared_lpot_exprs, (PROXY_SKELETONIZATION_SOURCE, auto_where[1]),
+            # NOTE: transforms are unconditionally removed here because the
+            # source would be the proxies, where we do not have normals, etc.
+            remove_transforms=True)
 
     # }}}
 
@@ -450,7 +496,7 @@ def make_skeletonization_wrangler(
         weighted_sources = weighted_targets = True
     elif isinstance(_weighted_proxy, bool):
         weighted_sources = weighted_targets = _weighted_proxy
-    elif isinstance(_weighted_proxy, tuple):
+    elif isinstance(_weighted_proxy, tuple) and len(_weighted_proxy) == 2:
         weighted_sources, weighted_targets = _weighted_proxy
     else:
         raise ValueError(f"unknown value for weighting: '{_weighted_proxy}'")
@@ -474,7 +520,10 @@ def make_skeletonization_wrangler(
 
     proxy_target_cluster_builder = _proxy_target_cluster_builder
     if proxy_target_cluster_builder is None:
-        proxy_target_cluster_builder = QBXClusterMatrixBuilder
+        if _remove_source_transforms:
+            proxy_target_cluster_builder = P2PClusterMatrixBuilder
+        else:
+            proxy_target_cluster_builder = QBXClusterMatrixBuilder
 
     # }}}
 
@@ -562,7 +611,8 @@ def _evaluate_proxy_skeletonization_interaction(
         actx: PyOpenCLArrayContext,
         places: GeometryCollection,
         proxy_generator: ProxyGeneratorBase,
-        cluster_index: IndexList, *,
+        source_index: IndexList,
+        target_index: IndexList, *,
         evaluate_proxy: Callable[...,
             tuple[onp.Array1D[np.inexact], TargetAndSourceClusterList]],
         evaluate_neighbor: Callable[...,
@@ -574,23 +624,71 @@ def _evaluate_proxy_skeletonization_interaction(
     each cluster in *cluster_index*.
     """
 
-    if cluster_index.nclusters == 1:
+    if source_index.nclusters == 1:
         raise ValueError("cannot make a proxy skeleton for a single cluster")
 
     from pytential.linalg.proxy import gather_cluster_neighbor_points
-    pxy = proxy_generator(actx, dofdesc, cluster_index)
+    pxy = proxy_generator(actx, dofdesc, source_index)
     nbrindex = gather_cluster_neighbor_points(
-            actx, pxy,
+            actx, pxy, target_index,
             max_particles_in_box=max_particles_in_box)
 
     pxymat, pxy_cluster_index = evaluate_proxy(actx, places, pxy, nbrindex)
     nbrmat, nbr_cluster_index = evaluate_neighbor(actx, places, pxy, nbrindex)
-
-    return _ProxyNeighborEvaluationResult(
+    result = _ProxyNeighborEvaluationResult(
             pxy=pxy,
             pxymat=pxymat, pxyindex=pxy_cluster_index,
             nbrmat=nbrmat, nbrindex=nbr_cluster_index)
 
+    return result
+
+
+def _worker_skeletonize_block_by_proxy(
+        data: tuple[int, onp.Array2D[np.inexact], onp.Array2D[np.inexact]],
+        *,
+        tgt_src_index: TargetAndSourceClusterList,
+        id_rank: int | None,
+        id_eps: float | None,
+        rng: np.random.Generator,
+    ) -> tuple[int,
+               onp.Array2D[np.inexact], onp.Array2D[np.inexact],
+               onp.Array1D[np.integer], onp.Array1D[np.integer]]:
+    """Skeletonize one cluster given by ``data = (i, src_mat, tgt_mat)``.
+
+    :returns: ``(i, L_i, R_i, skel_src_indices_i, skel_tgt_indices_i)``.
+    """
+    from pytential.linalg.utils import interp_decomp
+
+    icluster, src_mat, tgt_mat = data
+    rank_bound = min(*src_mat.shape, *tgt_mat.shape)
+
+    if __debug__:
+        for mat in (tgt_mat, src_mat):
+            finite = np.isfinite(mat)
+            assert np.all(finite), np.where(~finite)
+
+    # target points: the ID is applied to the transpose, so transpose back for L
+    rank, pivots, interp = interp_decomp(
+        tgt_mat.T, rank=id_rank, eps=id_eps, rng=rng)
+    assert 0 < rank <= len(pivots)
+
+    if rank > rank_bound:
+        rank = rank_bound
+        interp = interp[:rank, :]
+
+    L_i = interp.T
+    skel_tgt = tgt_src_index.targets.cluster_indices(icluster)[pivots[:rank]]
+    assert L_i.shape == (tgt_mat.shape[0], rank)
+
+    # source points: reuse the target rank so that both skeletons match in size
+    rank, pivots, R_i = interp_decomp(src_mat, rank=rank, eps=None, rng=rng)
+    assert 0 < rank <= len(pivots)
+    skel_src = tgt_src_index.sources.cluster_indices(icluster)[pivots[:rank]]
+    assert R_i.shape == (rank, src_mat.shape[1])
+    assert skel_tgt.shape == skel_src.shape
+
+    return icluster, L_i, R_i, skel_src, skel_tgt
+
 
 def _skeletonize_block_by_proxy_with_mats(
         actx: PyOpenCLArrayContext, ibrow: int, ibcol: int,
@@ -615,79 +713,113 @@ def _skeletonize_block_by_proxy_with_mats(
             dofdesc=wrangler.domains[ibcol],
             max_particles_in_box=max_particles_in_box)
 
-    src_result = evaluate_skeletonization_interaction(
-            tgt_src_index.sources,
-            evaluate_proxy=partial(
-                wrangler.evaluate_source_proxy_interaction,
-                ibrow=ibrow, ibcol=ibcol),
-            evaluate_neighbor=partial(
-                wrangler.evaluate_source_neighbor_interaction,
-                ibrow=ibrow, ibcol=ibcol),
-            )
-    tgt_result = evaluate_skeletonization_interaction(
-            tgt_src_index.targets,
-            evaluate_proxy=partial(
-                wrangler.evaluate_target_proxy_interaction,
-                ibrow=ibrow, ibcol=ibcol),
-            evaluate_neighbor=partial(
-                wrangler.evaluate_target_neighbor_interaction,
-                ibrow=ibrow, ibcol=ibcol)
-            )
-
-    src_skl_indices = np.empty(nclusters, dtype=object)
-    tgt_skl_indices = np.empty(nclusters, dtype=object)
+    skel_src_indices = np.empty(nclusters, dtype=object)
+    skel_tgt_indices = np.empty(nclusters, dtype=object)
     skel_starts = np.zeros(nclusters + 1, dtype=np.int32)
 
     L = np.empty(nclusters, dtype=object)
     R = np.empty(nclusters, dtype=object)
 
-    from pytential.linalg.utils import interp_decomp
-
-    for i in range(nclusters):
-        k = id_rank
-        src_mat = np.vstack(src_result[i])
-        tgt_mat = np.hstack(tgt_result[i])
-        max_allowable_rank = min(*src_mat.shape, *tgt_mat.shape)
-
-        if __debug__:
-            isfinite = np.isfinite(tgt_mat)
-            assert np.all(isfinite), np.where(isfinite)
-            isfinite = np.isfinite(src_mat)
-            assert np.all(isfinite), np.where(isfinite)
-
-        # skeletonize target points
-        k, idx, interp = interp_decomp(tgt_mat.T, rank=k, eps=id_eps, rng=rng)
-        assert 0 < k <= len(idx)
-
-        if k > max_allowable_rank:
-            k = max_allowable_rank
-            interp = interp[:k, :]
-
-        L[i] = interp.T
-        tgt_skl_indices[i] = tgt_src_index.targets.cluster_indices(i)[idx[:k]]
-        assert interp.shape == (k, tgt_mat.shape[0])
-
-        # skeletonize source points
-        k, idx, interp = interp_decomp(src_mat, rank=k, eps=None, rng=rng)
-        assert 0 < k <= len(idx)
+    from pytools import ProcessTimer
+
+    with ProcessTimer() as pt:
+        src_result = evaluate_skeletonization_interaction(
+                tgt_src_index.sources, tgt_src_index.targets,
+                evaluate_proxy=partial(
+                    wrangler.evaluate_source_proxy_interaction,
+                    ibrow=ibrow, ibcol=ibcol),
+                evaluate_neighbor=partial(
+                    wrangler.evaluate_source_neighbor_interaction,
+                    ibrow=ibrow, ibcol=ibcol),
+                )
+        tgt_result = evaluate_skeletonization_interaction(
+                tgt_src_index.targets, tgt_src_index.sources,
+                evaluate_proxy=partial(
+                    wrangler.evaluate_target_proxy_interaction,
+                    ibrow=ibrow, ibcol=ibcol),
+                evaluate_neighbor=partial(
+                    wrangler.evaluate_target_neighbor_interaction,
+                    ibrow=ibrow, ibcol=ibcol)
+                )
+    logger.info("_skeletonize_block_by_proxy_with_mats (evaluate): completed (%s)",
+                pt)
+
+    with ProcessTimer() as pt:
+        mats = ((i, np.vstack(src_result[i]), np.hstack(tgt_result[i]))
+                for i in range(nclusters))
+        worker = partial(_worker_skeletonize_block_by_proxy,
+                         tgt_src_index=tgt_src_index,
+                         id_rank=id_rank,
+                         id_eps=id_eps,
+                         rng=rng)
+
+        import multiprocessing
+        import os
+
+        max_workers = int(os.environ.get("PYTENTIAL_HMATRIX_CPU_COUNT",
+                                         multiprocessing.cpu_count()))
+
+        if max_workers == 0:
+            for (i, L_i, R_i, skel_src_i, skel_tgt_i) in (worker(mat) for mat in mats):
+                L[i] = L_i
+                R[i] = R_i
+                skel_tgt_indices[i] = skel_tgt_i
+                skel_src_indices[i] = skel_src_i
+                skel_starts[i + 1] = skel_starts[i] + skel_tgt_i.size
+        else:
+            # NOTE: many threads (from Python / OpenCL / OpenBLAS) are already
+            # running when we get here, so the `fork` context does not work. The
+            # `forkserver` context is needed so that a new clean child is created.
+            context = multiprocessing.get_context("forkserver")
+
+            from concurrent.futures import ProcessPoolExecutor
+            with ProcessPoolExecutor(max_workers=max_workers,
+                                     mp_context=context) as pool:
+
+                for (i, L_i, R_i, skel_src_i, skel_tgt_i) in pool.map(worker, mats):
+                    L[i] = L_i
+                    R[i] = R_i
+                    skel_tgt_indices[i] = skel_tgt_i
+                    skel_src_indices[i] = skel_src_i
+                    skel_starts[i + 1] = skel_starts[i] + skel_tgt_i.size
+    logger.info("_skeletonize_block_by_proxy_with_mats (skeletonize): completed (%s)",
+                pt)
+
+    # evaluate diagonal
+    from pytential.linalg.utils import make_flat_cluster_diag
+    mat = wrangler.evaluate_self(actx, places, tgt_src_index, ibrow, ibcol)
+    D = make_flat_cluster_diag(mat, tgt_src_index)
+
+    from pytential.linalg import make_index_list
+    skel_src_index = make_index_list(np.hstack(list(skel_src_indices)), skel_starts)
+    skel_tgt_index = make_index_list(np.hstack(list(skel_tgt_indices)), skel_starts)
+    skel_tgt_src_index = TargetAndSourceClusterList(skel_tgt_index, skel_src_index)
 
-        R[i] = interp
-        src_skl_indices[i] = tgt_src_index.sources.cluster_indices(i)[idx[:k]]
-        assert interp.shape == (k, src_mat.shape[1])
+    return SkeletonizationResult(
+            L=L, R=R, D=D,
+            tgt_src_index=tgt_src_index, skel_tgt_src_index=skel_tgt_src_index,
+            _src_eval_result=src_result, _tgt_eval_result=tgt_result)
 
-        skel_starts[i + 1] = skel_starts[i] + k
-        assert tgt_skl_indices[i].shape == src_skl_indices[i].shape
 
-    from pytential.linalg.utils import make_index_list
+def _evaluate_root(
+        actx: PyOpenCLArrayContext, ibrow: int, ibcol: int,
+        places: GeometryCollection,
+        wrangler: SkeletonizationWrangler,
+        tgt_src_index: TargetAndSourceClusterList
+        ) -> SkeletonizationResult:
+    """Wrap the dense root block ``D`` with identity ``L`` and ``R`` factors."""
+    assert tgt_src_index.nclusters == 1
 
-    src_skl_index = make_index_list(np.hstack(list(src_skl_indices)), skel_starts)
-    tgt_skl_index = make_index_list(np.hstack(list(tgt_skl_indices)), skel_starts)
-    skel_tgt_src_index = TargetAndSourceClusterList(tgt_skl_index, src_skl_index)
+    from pytential.linalg.utils import make_flat_cluster_diag
+    mat = wrangler.evaluate_self(actx, places, tgt_src_index, ibrow, ibcol)
+    D = make_flat_cluster_diag(mat, tgt_src_index)
 
     return SkeletonizationResult(
-            L=L, R=R,
-            tgt_src_index=tgt_src_index, skel_tgt_src_index=skel_tgt_src_index,
-            _src_eval_result=src_result, _tgt_eval_result=tgt_result)
+        # NOTE: match the dtype of D; a plain np.eye would always be float64
+        L=obj_array.new_1d([np.eye(*D[0].shape, dtype=D[0].dtype)]),
+        R=obj_array.new_1d([np.eye(*D[0].shape, dtype=D[0].dtype)]),
+        D=D, tgt_src_index=tgt_src_index, skel_tgt_src_index=tgt_src_index,
+        _src_eval_result=None, _tgt_eval_result=None)
 
 # }}}
 
@@ -719,6 +851,7 @@ class SkeletonizationResult:
 
     .. autoattribute:: L
     .. autoattribute:: R
+    .. autoattribute:: D
     .. autoattribute:: tgt_src_index
     .. autoattribute:: skel_tgt_src_index
     """
@@ -729,6 +862,9 @@ class SkeletonizationResult:
     R: obj_array.ObjectArray1D[onp.Array2D[Any]]
     """An object :class:`~numpy.ndarray` of size ``(nclusters,)`` that contains
     the right block interpolation matrices."""
+    D: obj_array.ObjectArray1D[onp.Array2D[Any]]
+    """An object :class:`~numpy.ndarray` of size ``(nclusters,)`` that contains
+    the dense diagonal blocks."""
 
     tgt_src_index: TargetAndSourceClusterList
     """A :class:`~pytential.linalg.utils.TargetAndSourceClusterList` representing
@@ -766,16 +902,36 @@ def nclusters(self) -> int:
         """Number of clusters that have been skeletonized."""
         return self.tgt_src_index.nclusters
 
+    @property
+    def dtype(self) -> np.dtype[Any]:
+        # FIXME: check that everyone has the same dtype? probably in __post_init__
+        return self.L[0].dtype
 
+    @property
+    @memoize_method
+    def invD(self) -> obj_array.ObjectArray1D[onp.Array2D[np.inexact]]:
+        return obj_array.new_1d([la.inv(D) for D in self.D])
+
+    @property
+    @memoize_method
+    def Dhat(self) -> obj_array.ObjectArray1D[onp.Array2D[np.inexact]]:
+        """Inverses of the per-cluster products ``R[i] @ invD[i] @ L[i]``."""
+        invD = self.invD
+        return obj_array.new_1d([
+            la.inv(self.R[i] @ invD[i] @ self.L[i]) for i in range(self.nclusters)])
+
+
+@log_process(logger)
 def skeletonize_by_proxy(
         actx: PyOpenCLArrayContext,
         places: GeometryCollection,
 
         tgt_src_index: TargetAndSourceClusterList,
-        exprs: sym.var | Sequence[sym.var],
+        exprs: ArithmeticExpression | Sequence[ArithmeticExpression],
         input_exprs: sym.var | Sequence[sym.var], *,
         domains: Sequence[Hashable] | None = None,
         context: dict[str, Any] | None = None,
+        auto_where: Any = None,
 
         approx_nproxy: int | None = None,
         proxy_radius_factor: float | None = None,
@@ -784,7 +940,7 @@ def skeletonize_by_proxy(
         id_rank: int | None = None,
         rng: np.random.Generator | None = None,
         max_particles_in_box: int | None = None,
-    ) -> obj_array.ObjectArray2D[onp.Array2D[np.inexact]]:
+    ) -> obj_array.ObjectArray2D[SkeletonizationResult]:
     r"""Evaluate and skeletonize a symbolic expression using proxy-based methods.
 
     :arg tgt_src_index: a :class:`~pytential.linalg.utils.TargetAndSourceClusterList`
@@ -810,21 +966,105 @@ def skeletonize_by_proxy(
     from pytential.linalg.proxy import QBXProxyGenerator
     wrangler = make_skeletonization_wrangler(
             places, exprs, input_exprs,
-            domains=domains, context=context)
+            domains=domains, context=context, auto_where=auto_where)
     proxy = QBXProxyGenerator(places,
             approx_nproxy=approx_nproxy,
             radius_factor=proxy_radius_factor)
 
+    from itertools import product
+
     skels = np.empty((wrangler.nrows, wrangler.ncols), dtype=object)
-    for ibrow in range(wrangler.nrows):
-        for ibcol in range(wrangler.ncols):
-            skels[ibrow, ibcol] = _skeletonize_block_by_proxy_with_mats(
-                    actx, ibrow, ibcol, places, proxy, wrangler, tgt_src_index,
-                    id_eps=id_eps,
-                    id_rank=id_rank,
-                    max_particles_in_box=max_particles_in_box,
-                    rng=rng)
+    for ibrow, ibcol in product(range(wrangler.nrows), range(wrangler.ncols)):
+        skels[ibrow, ibcol] = _skeletonize_block_by_proxy_with_mats(
+                actx, ibrow, ibcol, places, proxy, wrangler, tgt_src_index,
+                id_eps=id_eps, id_rank=id_rank,
+                max_particles_in_box=max_particles_in_box,
+                rng=rng)
 
     return skels
 
 # }}}
+
+
+# {{{ recursive skeletonization by proxy
+
+@log_process(logger)
+def rec_skeletonize_by_proxy(
+        actx: PyOpenCLArrayContext,
+        places: GeometryCollection,
+
+        ctree: ClusterTree,
+        tgt_src_index: TargetAndSourceClusterList,
+        exprs: ArithmeticExpression | Sequence[ArithmeticExpression],
+        input_exprs: sym.var | Sequence[sym.var], *,
+        domains: Sequence[Hashable] | None = None,
+        context: dict[str, Any] | None = None,
+        auto_where: Any = None,
+
+        approx_nproxy: int | None = None,
+        proxy_radius_factor: float | None = None,
+
+        id_eps: float | None = None,
+        rng: np.random.Generator | None = None,
+        max_particles_in_box: int | None = None,
+
+        _wrangler: SkeletonizationWrangler | None = None,
+        _proxy: ProxyGeneratorBase | None = None,
+    ) -> obj_array.ObjectArray1D[SkeletonizationResult]:
+    r"""Recursively skeletonize a scalar operator over the levels of *ctree*.
+
+    All levels but the last are compressed with the same per-block routine as
+    :func:`skeletonize_by_proxy` (*id_eps* defaults to ``1.0e-8``, no rank is
+    prescribed) and the resulting skeletons are regrouped into the clusters of
+    the next level. The remaining single root cluster is evaluated directly.
+
+    :returns: an object :class:`~numpy.ndarray` of :class:`SkeletonizationResult`\ s,
+        one per level in *ctree*.
+    :raises NotImplementedError: if the operator has more than one block.
+    """
+    assert ctree.nclusters == tgt_src_index.nclusters
+
+    if id_eps is None:
+        id_eps = 1.0e-8
+
+    proxy = _proxy
+    if proxy is None:
+        from pytential.linalg.proxy import QBXProxyGenerator
+        proxy = QBXProxyGenerator(
+            places, approx_nproxy=approx_nproxy, radius_factor=proxy_radius_factor)
+
+    wrangler = _wrangler
+    if wrangler is None:
+        wrangler = make_skeletonization_wrangler(
+                places, exprs, input_exprs,
+                domains=domains, context=context, auto_where=auto_where)
+
+    if wrangler.nrows != 1 or wrangler.ncols != 1:
+        raise NotImplementedError("support for block matrices")
+
+    skel_per_level = np.empty(ctree.nlevels, dtype=object)
+    for ilevel, clevel in enumerate(ctree.levels[:-1]):
+        # NOTE: only the (0, 0) block exists due to the check above
+        skeleton = _skeletonize_block_by_proxy_with_mats(
+            actx, 0, 0, proxy.places, proxy, wrangler, tgt_src_index,
+            id_eps=id_eps,
+            # NOTE: we probably never want to set the rank here?
+            id_rank=None,
+            rng=rng,
+            max_particles_in_box=max_particles_in_box)
+        skel_per_level[ilevel] = skeleton
+
+        # regroup the skeleton indices into the clusters of the next level
+        tgt_src_index = cluster(skeleton.skel_tgt_src_index, clevel)
+
+    assert tgt_src_index.nclusters == 1
+    assert not isinstance(skel_per_level[-1], SkeletonizationResult)
+
+    # evaluate the full root cluster (no skeletonization or anything)
+    skel_per_level[-1] = _evaluate_root(actx, 0, 0, places, wrangler, tgt_src_index)
+
+    return skel_per_level
+
+# }}}
+
+# vim: foldmethod=marker
diff --git a/pytential/linalg/utils.py b/pytential/linalg/utils.py
index ed86aff85..977ce42be 100644
--- a/pytential/linalg/utils.py
+++ b/pytential/linalg/utils.py
@@ -53,6 +53,9 @@
 
 .. autofunction:: make_index_list
 .. autofunction:: make_index_cluster_cartesian_product
+.. autofunction:: make_flat_cluster_diag
+
+.. autofunction:: interp_decomp
 """
 
 InexactT = TypeVar("InexactT", bound=np.inexact)
@@ -479,6 +482,22 @@ def mnorm(x: onp.Array2D[np.inexact],
     return tgt_error, src_error
 
 
+def skeletonization_matrix(
+        mat: np.ndarray, skeleton: SkeletonizationResult,
+        ) -> tuple[np.ndarray, np.ndarray]:
+    """Gather the diagonal blocks ``D`` (on the full cluster indices) and the
+    off-diagonal blocks ``S`` (on the skeleton indices) of *mat*."""
+    n = skeleton.nclusters
+    D: np.ndarray = np.empty(n, dtype=object)
+    S: np.ndarray = np.empty((n, n), dtype=object)
+    for i in range(n):
+        D[i] = skeleton.tgt_src_index.cluster_take(mat, i, i)
+        for j in (*range(i), *range(i + 1, n)):
+            S[i, j] = skeleton.skel_tgt_src_index.cluster_take(mat, i, j)
+
+    return D, S
+
+
 def skeletonization_error(
         mat: onp.Array2D[np.inexact], skeleton: SkeletonizationResult, *,
         ord: float | None = None,
@@ -540,4 +559,51 @@ def skeletonization_error(
 
 # }}}
 
+
+# {{{ eigenvalues
+
+def eigs(
+        mat, *,
+        k: int = 6,
+        which: str = "LM",
+        maxiter: int | None = None,
+        tol: float = 0.0) -> np.ndarray:
+    """Compute *k* eigenvalues of *mat* with :func:`scipy.sparse.linalg.eigs`.
+
+    No eigenvectors are computed. A warning is issued when the largest
+    imaginary part of the returned eigenvalues exceeds ``1.0e-14``.
+    """
+    from scipy.sparse.linalg import eigs as sparse_eigs
+
+    eigenvalues = sparse_eigs(
+        mat, k=k, which=which, maxiter=maxiter, tol=tol,
+        return_eigenvectors=False)
+    max_imag = np.linalg.norm(np.imag(eigenvalues), ord=np.inf)
+    if max_imag > 1.0e-14:
+        from warnings import warn
+        warn(f"eigenvalues are not real enough: norm(imag) = {max_imag:.12e}",
+             stacklevel=2)
+    return eigenvalues
+
+
+def cond(mat, *,
+        mat_inv=None,
+        p: float | None = None,
+        tol: float = 1.0e-6) -> float:
+    """Estimate the condition number of *mat* as the ratio of its largest to
+    its smallest eigenvalue magnitude (only ``p = 2`` is supported).
+
+    :arg mat_inv: if given, an operator for the inverse of *mat*, whose
+        largest eigenvalue is used in place of ``1 / lambda_min``.
+    """
+    if p is not None and p != 2:
+        raise ValueError(f"unsupported norm order: '{p}'")
+
+    lambda_max = np.abs(eigs(mat, k=1, which="LM", tol=tol))
+    if mat_inv is None:
+        return lambda_max / np.abs(eigs(mat, k=1, which="SM", tol=tol))
+    return lambda_max * np.abs(eigs(mat_inv, k=1, which="LM", tol=tol))
+
+# }}}
+
 # vim: foldmethod=marker
diff --git a/pytential/symbolic/matrix.py b/pytential/symbolic/matrix.py
index f51ba1887..2c2cc5165 100644
--- a/pytential/symbolic/matrix.py
+++ b/pytential/symbolic/matrix.py
@@ -561,7 +561,6 @@ def map_int_g(self, expr):
                 expr.target.geometry, expr.target.discr_stage)
 
         actx = self.array_context
-        target_base_kernel = expr.target_kernel.get_base_kernel()
 
         result = 0
         for density, kernel in zip(expr.densities, expr.source_kernels, strict=True):
@@ -575,12 +574,10 @@ def map_int_g(self, expr):
 
             # {{{ generator
 
-            base_kernel = kernel.get_base_kernel()
-
             from sumpy.p2p import P2PMatrixGenerator
             mat_gen = P2PMatrixGenerator(actx.context,
-                    source_kernels=(base_kernel,),
-                    target_kernels=(target_base_kernel,),
+                    source_kernels=(kernel,),
+                    target_kernels=(expr.target_kernel,),
                     exclude_self=self.exclude_self)
 
             # }}}
@@ -590,7 +587,7 @@ def map_int_g(self, expr):
             # {{{ kernel args
 
             # NOTE: copied from pytential.symbolic.primitives.IntG
-            kernel_args = base_kernel.get_args() + base_kernel.get_source_args()
+            kernel_args = kernel.get_args() + kernel.get_source_args()
             kernel_args = {arg.loopy_arg.name for arg in kernel_args}
 
             kernel_args = _get_layer_potential_args(
@@ -775,7 +772,6 @@ def map_int_g(self, expr: prim.IntG):
                 expr.target.geometry, expr.target.discr_stage)
 
         actx = self.array_context
-        target_base_kernel = expr.target_kernel.get_base_kernel()
 
         result = 0
         for kernel, density in zip(expr.source_kernels, expr.densities, strict=True):
@@ -796,12 +792,10 @@ def map_int_g(self, expr: prim.IntG):
 
             # {{{ generator
 
-            base_kernel = kernel.get_base_kernel()
-
             from sumpy.p2p import P2PMatrixSubsetGenerator
             mat_gen = P2PMatrixSubsetGenerator(actx.context,
-                    source_kernels=(base_kernel,),
-                    target_kernels=(target_base_kernel,),
+                    source_kernels=(kernel,),
+                    target_kernels=(expr.target_kernel,),
                     exclude_self=self.exclude_self)
 
             # }}}
@@ -811,7 +805,7 @@ def map_int_g(self, expr: prim.IntG):
             # {{{ kernel args
 
             # NOTE: copied from pytential.symbolic.primitives.IntG
-            kernel_args = [*base_kernel.get_args(), *base_kernel.get_source_args()]
+            kernel_args = [*kernel.get_args(), *kernel.get_source_args()]
             kernel_args = {arg.loopy_arg.name for arg in kernel_args}
 
             kernel_args = _get_layer_potential_args(
diff --git a/test/extra_matrix_data.py b/test/extra_matrix_data.py
index 46ff9686f..d758b9e8c 100644
--- a/test/extra_matrix_data.py
+++ b/test/extra_matrix_data.py
@@ -49,17 +49,26 @@ class MatrixTestCaseMixin:
     proxy_target_cluster_builder: Callable[..., Any] | None = None
     neighbor_cluster_builder: Callable[..., Any] | None = None
 
-    def get_cluster_index(self, actx, places, dofdesc=None):
+    def max_particles_in_box_for_discr(self, discr):
+        """Return the configured box size, defaulting to an even split of the
+        degrees of freedom of *discr* into ``approx_cluster_count`` clusters."""
+        if self.max_particles_in_box is not None:
+            return self.max_particles_in_box
+        return discr.ndofs // self.approx_cluster_count
+
+    def get_cluster_index(
+            self, actx, places, dofdesc=None, max_particles_in_box=None):
         if dofdesc is None:
             dofdesc = places.auto_source
         discr = places.get_discretization(dofdesc.geometry)
 
-        max_particles_in_box = self.max_particles_in_box
         if max_particles_in_box is None:
-            max_particles_in_box = discr.ndofs // self.approx_cluster_count
+            max_particles_in_box = self.max_particles_in_box
+            if max_particles_in_box is None:
+                max_particles_in_box = discr.ndofs // self.approx_cluster_count
 
-        from pytential.linalg.proxy import partition_by_nodes
-        cindex = partition_by_nodes(actx, places,
+        from pytential.linalg.cluster import partition_by_nodes
+        cindex, ctree = partition_by_nodes(actx, places,
                 dofdesc=dofdesc,
                 tree_kind=self.tree_kind,
                 max_particles_in_box=max_particles_in_box)
@@ -81,12 +90,14 @@ def get_cluster_index(self, actx, places, dofdesc=None):
             from pytential.linalg import make_index_list
             cindex = make_index_list(subset)
 
-        return cindex
+        return cindex, ctree
 
-    def get_tgt_src_cluster_index(self, actx, places, dofdesc=None):
+    def get_tgt_src_cluster_index(
+            self, actx, places, dofdesc=None, max_particles_in_box=None):
         from pytential.linalg import TargetAndSourceClusterList
-        cindex = self.get_cluster_index(actx, places, dofdesc=dofdesc)
-        return TargetAndSourceClusterList(cindex, cindex)
+        cindex, ctree = self.get_cluster_index(
+            actx, places, dofdesc=dofdesc, max_particles_in_box=max_particles_in_box)
+        return TargetAndSourceClusterList(cindex, cindex), ctree
 
     def get_operator(self, ambient_dim, qbx_forced_limit=_NoArgSentinel):
         knl = self.knl_class(ambient_dim)
diff --git a/test/test_linalg_cluster.py b/test/test_linalg_cluster.py
new file mode 100644
index 000000000..25605993f
--- /dev/null
+++ b/test/test_linalg_cluster.py
@@ -0,0 +1,133 @@
+from __future__ import annotations
+
+
+__copyright__ = "Copyright (C) 2022 Alexandru Fikl"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import logging
+
+import extra_matrix_data as extra
+import numpy as np
+import pytest
+
+from arraycontext import pytest_generate_tests_for_array_contexts
+from meshmode import _acf  # noqa: F401
+from meshmode.array_context import PytestPyOpenCLArrayContextFactory
+from meshmode.mesh.generation import NArmedStarfish
+
+from pytential import GeometryCollection
+
+
+logger = logging.getLogger(__name__)
+
+pytest_generate_tests = pytest_generate_tests_for_array_contexts([
+    PytestPyOpenCLArrayContextFactory,
+    ])
+
+CLUSTER_TEST_CASES = [
+        extra.CurveTestCase(
+            name="starfish",
+            target_order=4,
+            curve_fn=NArmedStarfish(5, 0.25),
+            resolutions=[64]),
+        extra.TorusTestCase(
+            target_order=4,
+            resolutions=[1])
+        ]
+
+
+# {{{ test_cluster_tree
+
+@pytest.mark.parametrize(("case", "tree_kind"), [
+    (CLUSTER_TEST_CASES[0], None),
+    (CLUSTER_TEST_CASES[0], "adaptive"),
+    (CLUSTER_TEST_CASES[0], "adaptive-level-restricted"),
+    (CLUSTER_TEST_CASES[1], "adaptive"),
+    ])
+def test_cluster_tree(actx_factory, case, tree_kind, visualize=False):
+    """Check level-by-level clustering against each level's ``parent_map``."""
+    if visualize:
+        logging.basicConfig(level=logging.INFO)
+
+    from dataclasses import replace
+    actx = actx_factory()
+    case = replace(case, tree_kind=tree_kind)
+    logger.info("\n%s", case)
+
+    discr = case.get_discretization(actx, case.resolutions[-1], case.target_order)
+    places = GeometryCollection(discr, auto_where=case.name)
+
+    srcindex, ctree = case.get_cluster_index(actx, places)
+    assert srcindex.nclusters == ctree.nclusters
+
+    from pytential.linalg.cluster import cluster, split_array, uncluster
+    rng = np.random.default_rng(42)
+    x = split_array(rng.random(srcindex.indices.shape), srcindex)
+
+    logger.info("nclusters %4d nlevels %4d", srcindex.nclusters, ctree.nlevels)
+
+    if visualize and ctree._tree is not None:
+        import matplotlib.pyplot as plt
+        fig = plt.figure(figsize=(10, 10), dpi=300)
+
+        from boxtree.visualization import TreePlotter
+        plotter = TreePlotter(ctree._tree)
+        plotter.draw_tree(fill=False, edgecolor="black", zorder=10)
+        plotter.draw_box_numbers()
+        plotter.set_bounding_box()
+
+        fig.savefig("test_cluster_tree")
+
+    for clevel in ctree.levels:
+        logger.info("======== Level %d", clevel.level)
+        logger.info("box_ids        %s", clevel.box_ids)
+        logger.info("sizes          %s", np.diff(srcindex.starts))
+        logger.info("parent_map     %s", clevel.parent_map)
+        assert srcindex.nclusters == clevel.nclusters
+
+        # every parent cluster is the concatenation of its children, in order
+        next_srcindex = cluster(srcindex, clevel)
+        for iparent, children in enumerate(clevel.parent_map):
+            expected = np.concatenate([srcindex.cluster_indices(j) for j in children])
+            assert expected.size == next_srcindex.cluster_size(iparent)
+            assert np.allclose(expected, next_srcindex.cluster_indices(iparent))
+
+        # clustering followed by unclustering must reproduce the data
+        y = cluster(x, clevel)
+        z = uncluster(y, srcindex, clevel)
+        assert all(np.allclose(xi, zi) for xi, zi in zip(x, z, strict=True))
+
+        srcindex = next_srcindex
+        x = y
+
+# }}}
+
+
+if __name__ == "__main__":
+    import sys
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        from pytest import main
+        main([__file__])
+
+# vim: fdm=marker
diff --git a/test/test_linalg_hmatrix.py b/test/test_linalg_hmatrix.py
new file mode 100644
index 000000000..84aabf53f
--- /dev/null
+++ b/test/test_linalg_hmatrix.py
@@ -0,0 +1,593 @@
+from __future__ import annotations
+
+
+__copyright__ = "Copyright (C) 2022 Alexandru Fikl"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import logging
+from dataclasses import replace
+
+import extra_matrix_data as extra
+import numpy as np
+import pytest
+
+from arraycontext import pytest_generate_tests_for_array_contexts
+from meshmode import _acf  # noqa: F401
+from meshmode.array_context import PytestPyOpenCLArrayContextFactory
+from meshmode.mesh.generation import NArmedStarfish
+
+from pytential import GeometryCollection, bind, sym
+
+
+logger = logging.getLogger(__name__)
+
+pytest_generate_tests = pytest_generate_tests_for_array_contexts([
+    PytestPyOpenCLArrayContextFactory,
+    ])
+
+
+HMATRIX_TEST_CASES = [  # referenced by index in the parametrizations below
+        extra.CurveTestCase(
+            name="starfish",
+            op_type="scalar",
+            target_order=4,
+            curve_fn=NArmedStarfish(5, 0.25),
+            resolutions=[512]),
+        extra.CurveTestCase(
+            name="starfish",
+            op_type="double",
+            target_order=4,
+            curve_fn=NArmedStarfish(5, 0.25),
+            resolutions=[512]),
+        extra.TorusTestCase(
+            target_order=4,
+            op_type="scalar",
+            resolutions=[0])
+        ]
+
+
+# {{{ test_hmatrix_forward_matvec_single_level
+
+def hmatrix_matvec_single_level(mat, x, skeleton):
+    """Apply the single-level skeletonized form of *mat* to the flat *x*."""
+    from itertools import permutations
+
+    from pytential.linalg.cluster import split_array
+    from pytential.linalg.utils import skeletonization_matrix
+
+    targets, sources = skeleton.tgt_src_index
+    D, S = skeletonization_matrix(mat, skeleton)
+
+    # apply ``skeleton.R`` to every source block of the input
+    y = split_array(x, sources)
+    y_hat = np.empty(y.shape, dtype=object)
+    for j in range(skeleton.nclusters):
+        y_hat[j] = skeleton.R[j] @ y[j]
+
+    # accumulate the off-diagonal ``S[i, j]`` contributions (all pairs i != j)
+    syhat = np.zeros(y.shape, dtype=object)
+    for i, j in permutations(range(skeleton.nclusters), 2):
+        syhat[i] = syhat[i] + S[i, j] @ y_hat[j]
+
+    for i in range(skeleton.nclusters):
+        y[i] = D[i] @ y[i] + skeleton.L[i] @ syhat[i]
+
+    return np.concatenate(y)[np.argsort(targets.indices)]
+
+
+@pytest.mark.parametrize("case", HMATRIX_TEST_CASES)
+@pytest.mark.parametrize("discr_stage", [sym.QBX_SOURCE_STAGE1])
+def test_hmatrix_forward_matvec_single_level(
+        actx_factory, case, discr_stage, visualize=False):
+    """Check a single-level skeletonized matvec against the full matrix."""
+    actx = actx_factory()
+    rng = np.random.default_rng(42)
+
+    if visualize:
+        logging.basicConfig(level=logging.INFO)
+
+    if case.ambient_dim == 2:
+        kwargs = {"proxy_approx_count": 64, "proxy_radius_factor": 1.15}
+    else:
+        kwargs = {"proxy_approx_count": 256, "proxy_radius_factor": 1.25}
+
+    case = replace(case, skel_discr_stage=discr_stage, **kwargs)
+    logger.info("\n%s", case)
+
+    # {{{ geometry
+
+    dd = sym.DOFDescriptor(case.name, discr_stage=case.skel_discr_stage)
+    qbx = case.get_layer_potential(actx, case.resolutions[-1], case.target_order)
+    places = GeometryCollection(qbx, auto_where=dd)
+
+    density_discr = places.get_discretization(dd.geometry, dd.discr_stage)
+    tgt_src_index, _ = case.get_tgt_src_cluster_index(actx, places, dd)
+
+    logger.info("dd %s", dd)
+    logger.info("nclusters %3d ndofs %7d",
+            tgt_src_index.nclusters, density_discr.ndofs)
+
+    # }}}
+
+    # {{{ construct reference
+
+    from pytential.linalg.direct_solver_symbolic import prepare_expr
+    from pytential.symbolic.matrix import MatrixBuilder
+    sym_u, sym_op = case.get_operator(places.ambient_dim)
+    sym_op_prepr, = prepare_expr(places, [sym_op], (dd, dd))
+    mat = MatrixBuilder(
+        actx,
+        dep_expr=sym_u,
+        other_dep_exprs=[],
+        dep_discr=density_discr,
+        places=places,
+        context={},
+        )(sym_op_prepr)
+
+    from arraycontext import flatten, unflatten
+    x = actx.thaw(density_discr.nodes()[0])
+    y = actx.to_numpy(flatten(x, actx))
+    r_lpot = unflatten(x, actx.from_numpy(mat @ y), actx)
+
+    # }}}
+
+    # {{{ check matvec
+
+    id_eps = 10.0 ** (-np.arange(2, 16))
+    rec_error = np.zeros_like(id_eps)
+
+    from pytools.convergence import EOCRecorder
+    eoc = EOCRecorder()
+
+    from pytential.linalg.skeletonization import skeletonize_by_proxy
+    for i in range(id_eps.size):
+        skeleton = skeletonize_by_proxy(
+            actx, places, tgt_src_index, sym_op, sym_u,
+            domains=[dd], context={},
+            approx_nproxy=case.proxy_approx_count,
+            proxy_radius_factor=case.proxy_radius_factor,
+            id_eps=id_eps[i],
+            rng=rng,
+            )
+        r_hmat = hmatrix_matvec_single_level(mat, y, skeleton[0, 0])
+        r_hmat = unflatten(x, actx.from_numpy(r_hmat), actx)
+
+        from meshmode.dof_array import flat_norm
+        rec_error[i] = actx.to_numpy(
+            flat_norm(r_hmat - r_lpot) / flat_norm(r_lpot)
+            )
+        logger.info("id_eps %.2e error: %.12e", id_eps[i], rec_error[i])
+        # NOTE: informational only; `assert rec_error[i] < 0.1` is left disabled
+
+        eoc.add_data_point(id_eps[i], rec_error[i])
+
+    logger.info("\n%s", eoc.pretty_print(
+        abscissa_format="%.8e",
+        error_format="%.8e",
+        eoc_format="%.2f"))
+
+    # }}}
+
+    if not visualize:
+        return
+
+    import matplotlib.pyplot as pt
+    fig = pt.figure(figsize=(10, 10), dpi=300)
+    ax = fig.gca()
+
+    ax.loglog(id_eps, id_eps, "k--")
+    ax.loglog(id_eps, rec_error)
+
+    ax.grid(True)
+    ax.set_xlabel(r"$\epsilon_{id}$")
+    ax.set_ylabel("$Error$")
+    ax.set_title(case.name)
+
+    basename = "linalg_hmatrix_single_matvec"
+    fig.savefig(f"{basename}_{case.name}_{case.op_type}_convergence")
+
+    if case.ambient_dim == 2:
+        fig.clf()
+        ax = fig.gca()
+
+        r_hmap = actx.to_numpy(flatten(r_hmat, actx))
+        r_lpot = actx.to_numpy(flatten(r_lpot, actx))
+
+        # a log axis cannot show negative values: plot the error magnitude
+        ax.semilogy(np.abs(r_hmap - r_lpot))
+        ax.set_ylim([1.0e-16, 1.0])
+        fig.savefig(f"{basename}_{case.name}_{case.op_type}_error")
+
+    pt.close(fig)
+
+# }}}
+
+
+# {{{ test_hmatrix_forward_matvec
+
+@pytest.mark.parametrize("case", [
+    HMATRIX_TEST_CASES[0],
+    HMATRIX_TEST_CASES[1],
+    pytest.param(HMATRIX_TEST_CASES[2], marks=pytest.mark.slowtest),
+    ])
+@pytest.mark.parametrize("discr_stage", [
+    sym.QBX_SOURCE_STAGE1,
+    # sym.QBX_SOURCE_STAGE2
+    ])
+def test_hmatrix_forward_matvec(
+        actx_factory, case, discr_stage, p2p=False, visualize=False):
+    """Check convergence of the proxy-based H-matrix matvec in ``id_eps``."""
+    actx = actx_factory()
+    rng = np.random.default_rng(42)
+
+    if visualize:
+        logging.basicConfig(level=logging.INFO)
+
+    if case.ambient_dim == 2:
+        kwargs = {"proxy_approx_count": 64, "proxy_radius_factor": 1.25}
+    else:
+        kwargs = {"proxy_approx_count": 256, "proxy_radius_factor": 1.25}
+
+    case = replace(case, skel_discr_stage=discr_stage, **kwargs)
+    logger.info("\n%s", case)
+
+    # {{{ geometry
+
+    dd = sym.DOFDescriptor(case.name, discr_stage=case.skel_discr_stage)
+    qbx = case.get_layer_potential(actx, case.resolutions[-1], case.target_order)
+    places = GeometryCollection(qbx, auto_where=dd)
+
+    density_discr = places.get_discretization(dd.geometry, dd.discr_stage)
+    max_particles_in_box = case.max_particles_in_box_for_discr(density_discr)
+
+    tgt_src_index, _ = case.get_tgt_src_cluster_index(
+        actx, places, dd, max_particles_in_box=max_particles_in_box)
+
+    logger.info("dd %s", dd)
+    logger.info("nclusters %3d ndofs %7d",
+            tgt_src_index.nclusters, density_discr.ndofs)
+
+    # }}}
+
+    # {{{ construct hmatrix
+
+    from pytential.linalg.hmatrix import build_hmatrix_by_proxy
+    sym_u, sym_op = case.get_operator(places.ambient_dim)
+
+    x = actx.thaw(density_discr.nodes()[0])
+
+    if p2p:
+        # NOTE: this also needs to be changed in `build_hmatrix_by_proxy`
+        # to actually evaluate the p2p interactions instead of qbx
+        from pytential.linalg.direct_solver_symbolic import prepare_expr
+        from pytential.symbolic.matrix import P2PMatrixBuilder
+        mat = P2PMatrixBuilder(
+            actx,
+            dep_expr=sym_u,
+            other_dep_exprs=[],
+            dep_discr=density_discr,
+            places=places,
+            context={},
+            )(prepare_expr(places, sym_op, (dd, dd)))
+
+        from arraycontext import flatten, unflatten
+        y = actx.to_numpy(flatten(x, actx))
+        r_lpot = unflatten(x, actx.from_numpy(mat @ y), actx)
+    else:
+        r_lpot = bind(places, sym_op, auto_where=dd)(actx, u=x)
+
+    from pytential.linalg.hmatrix import hmatrix_error_from_param
+    id_eps = 10.0 ** (-np.arange(2, 16))
+    rec_error = np.zeros_like(id_eps)
+    model_error = np.zeros_like(id_eps)
+
+    from pytools.convergence import EOCRecorder
+    eoc = EOCRecorder()
+
+    for i in range(id_eps.size):
+        wrangler = build_hmatrix_by_proxy(
+            actx, places, sym_op, sym_u,
+            domains=[dd],
+            context=case.knl_concrete_kwargs,
+            id_eps=id_eps[i],
+            rng=rng,
+            _tree_kind=case.tree_kind,
+            _max_particles_in_box=max_particles_in_box,
+            _approx_nproxy=case.proxy_approx_count,
+            _proxy_radius_factor=case.proxy_radius_factor,
+            )
+        hmat = wrangler.get_forward()
+
+        # {{{ skeletonization error
+
+        from meshmode.dof_array import flat_norm
+        r_hmap = hmat @ x
+        rec_error[i] = actx.to_numpy(
+            flat_norm(r_hmap - r_lpot) / flat_norm(r_lpot)
+            )
+
+        # }}}
+
+        # {{{ model error
+
+        # evaluate the model on the cluster with the largest target skeleton
+        skeleton = hmat.skeletons[0]
+        icluster = np.argmax(np.diff(skeleton.skel_tgt_src_index.targets.starts))
+
+        proxy_radius = actx.to_numpy(
+            skeleton._src_eval_result.pxy.radii[icluster]
+            )
+        cluster_radius = actx.to_numpy(
+            skeleton._src_eval_result.pxy.cluster_radii[icluster]
+            )
+
+        model_error[i] = hmatrix_error_from_param(
+            places.ambient_dim,
+            id_eps=id_eps[i],
+            min_proxy_radius=proxy_radius,
+            max_cluster_radius=cluster_radius,
+            id_rank=skeleton.skel_tgt_src_index.targets.cluster_size(icluster),
+            nproxies=skeleton._src_eval_result.pxy.pxyindex.cluster_size(icluster),
+            ntargets=skeleton.tgt_src_index.targets.cluster_size(icluster),
+            nsources=skeleton.tgt_src_index.sources.cluster_size(icluster),
+            c=1.0e-8
+            )
+
+        # }}}
+
+        logger.info("id_eps %.2e error: %.12e (%.12e)",
+            id_eps[i], rec_error[i], model_error[i])
+        eoc.add_data_point(id_eps[i], rec_error[i])
+
+    logger.info("\n%s", eoc.pretty_print(
+        abscissa_format="%.8e",
+        error_format="%.8e",
+        eoc_format="%.2f"))
+
+    # }}}
+
+    if not visualize:
+        assert eoc.order_estimate() > 0.6
+        return
+
+    import matplotlib.pyplot as pt
+    fig = pt.figure(figsize=(10, 10), dpi=300)
+    ax = fig.gca()
+
+    ax.loglog(id_eps, id_eps, "k--")
+    ax.loglog(id_eps, rec_error)
+    ax.loglog(id_eps, model_error)
+
+    ax.grid(True)
+    ax.set_xlabel(r"$\epsilon_{id}$")
+    ax.set_ylabel("$Error$")
+    ax.set_title(case.name)
+
+    lpot_name = "p2p" if p2p else "qbx"
+    basename = f"linalg_hmatrix_{lpot_name}_matvec"
+    fig.savefig(f"{basename}_{case.name}_{case.op_type}_convergence")
+
+    if case.ambient_dim == 2:
+        fig.clf()
+        ax = fig.gca()
+
+        from arraycontext import flatten
+        r_hmap = actx.to_numpy(flatten(r_hmap, actx))
+        r_lpot = actx.to_numpy(flatten(r_lpot, actx))
+
+        ax.semilogy(np.abs(r_hmap - r_lpot))
+        ax.set_ylim([1.0e-16, 1.0])
+        fig.savefig(f"{basename}_{case.name}_{case.op_type}_error")
+
+    pt.close(fig)
+
+# }}}
+
+
+# {{{ test_hmatrix_backward_matvec
+
+@pytest.mark.parametrize("case", [
+    HMATRIX_TEST_CASES[0],
+    HMATRIX_TEST_CASES[1],
+    pytest.param(HMATRIX_TEST_CASES[2], marks=pytest.mark.slowtest),
+    ])
+@pytest.mark.parametrize("discr_stage", [
+    sym.QBX_SOURCE_STAGE1,
+    # sym.QBX_SOURCE_STAGE2
+    ])
+def test_hmatrix_backward_matvec(actx_factory, case, discr_stage, visualize=False):
+    """Check that the backward (inverse) H-matrix operator recovers a known
+    density from its layer potential, with an error that decreases with the
+    ID tolerance ``id_eps``.
+    """
+    actx = actx_factory()
+    rng = np.random.default_rng(42)
+
+    if visualize:
+        logging.basicConfig(level=logging.INFO)
+
+    # NOTE: the same proxy parameters are used in 2D and 3D
+    kwargs = {"proxy_approx_count": 64, "proxy_radius_factor": 1.25}
+
+    case = replace(case, skel_discr_stage=discr_stage, **kwargs)
+    logger.info("\n%s", case)
+
+    # {{{ geometry
+
+    dd = sym.DOFDescriptor(case.name, discr_stage=case.skel_discr_stage)
+    qbx = case.get_layer_potential(actx, case.resolutions[-1], case.target_order)
+    places = GeometryCollection(qbx, auto_where=dd)
+
+    density_discr = places.get_discretization(dd.geometry, dd.discr_stage)
+    max_particles_in_box = case.max_particles_in_box_for_discr(density_discr)
+
+    tgt_src_index, _ = case.get_tgt_src_cluster_index(
+        actx, places, dd, max_particles_in_box=max_particles_in_box)
+
+    logger.info("dd %s", dd)
+    logger.info("nclusters %3d ndofs %7d",
+            tgt_src_index.nclusters, density_discr.ndofs)
+
+    # }}}
+
+    # {{{ reference matrix (its spectrum is only computed when visualizing)
+
+    sym_u, sym_op = case.get_operator(places.ambient_dim)
+
+    if visualize:
+        from pytential.linalg.direct_solver_symbolic import prepare_expr
+        from pytential.symbolic.matrix import MatrixBuilder
+        mat = MatrixBuilder(
+            actx,
+            dep_expr=sym_u,
+            other_dep_exprs=[],
+            dep_discr=density_discr,
+            places=places,
+            context={},
+            )(prepare_expr(places, sym_op, (dd, dd)))
+
+        import pytential.linalg.utils as hla
+        eigs_ref = hla.eigs(mat, k=5)
+        kappa_ref = np.linalg.cond(mat, p=2)
+
+    # }}}
+
+    # {{{ construct hmatrix
+
+    from pytential.linalg.hmatrix import build_hmatrix_by_proxy
+
+    # manufactured problem: b_ref is the operator applied to the known x_ref
+    x_ref = actx.thaw(density_discr.nodes()[0])
+    b_ref = bind(places, sym_op, auto_where=dd)(actx, u=x_ref)
+
+    id_eps = 10.0 ** (-np.arange(2, 16))
+    rec_error = np.zeros_like(id_eps)
+
+    if visualize:
+        rec_eigs = np.zeros((id_eps.size, eigs_ref.size), dtype=np.complex128)
+        rec_kappa = np.zeros(id_eps.size)
+
+    from pytools.convergence import EOCRecorder
+    eoc = EOCRecorder()
+
+    for i in range(id_eps.size):
+        wrangler = build_hmatrix_by_proxy(
+            actx, places, sym_op, sym_u,
+            domains=[dd],
+            context=case.knl_concrete_kwargs,
+            id_eps=id_eps[i],
+            rng=rng,
+            _tree_kind=case.tree_kind,
+            _max_particles_in_box=max_particles_in_box,
+            _approx_nproxy=case.proxy_approx_count,
+            _proxy_radius_factor=case.proxy_radius_factor,
+            )
+
+        hmat_inv = wrangler.get_backward()
+        x_hmat = hmat_inv @ b_ref
+
+        if visualize:
+            hmat = wrangler.get_forward()
+            rec_eigs[i, :] = hla.eigs(hmat, k=5, tol=1.0e-6)
+            rec_kappa[i] = hla.cond(hmat, p=2, tol=1.0e-6)
+
+            logger.info("eigs: %s %s", eigs_ref, rec_eigs[i])
+            logger.info("kappa %.12e %.12e", kappa_ref, rec_kappa[i])
+
+        from meshmode.dof_array import flat_norm
+        rec_error[i] = actx.to_numpy(
+            flat_norm(x_hmat - x_ref) / flat_norm(x_ref)
+            )
+        logger.info("id_eps %.2e error: %.12e", id_eps[i], rec_error[i])
+        eoc.add_data_point(id_eps[i], rec_error[i])
+
+    logger.info("\n%s", eoc.pretty_print(
+        abscissa_format="%.8e",
+        error_format="%.8e",
+        eoc_format="%.2f"))
+
+    # }}}
+
+    if not visualize:
+        assert eoc.order_estimate() > 0.6
+        return
+
+    import matplotlib.pyplot as pt
+    fig = pt.figure(figsize=(10, 10), dpi=300)
+
+    # {{{ convergence
+
+    ax = fig.gca()
+    ax.loglog(id_eps, id_eps, "k--")
+    ax.loglog(id_eps, rec_error)
+
+    ax.grid(True)
+    ax.set_xlabel(r"$\epsilon_{id}$")
+    ax.set_ylabel("$Error$")
+    ax.set_title(case.name)
+
+    fig.savefig(f"linalg_hmatrix_inverse_{case.name}_{case.op_type}_convergence")
+    fig.clf()
+
+    # }}}
+
+    # {{{ eigs
+
+    ax = fig.gca()
+    ax.plot(np.real(eigs_ref), np.imag(eigs_ref), "ko")
+    for i in range(id_eps.size):
+        ax.plot(np.real(rec_eigs[i]), np.imag(rec_eigs[i]), "v")
+
+    ax.grid(True)
+    ax.set_xlabel(r"$\Re \lambda$")
+    ax.set_ylabel(r"$\Im \lambda$")
+
+    fig.savefig(f"linalg_hmatrix_inverse_{case.name}_{case.op_type}_eigs")
+    fig.clf()
+
+    # }}}
+
+    if case.ambient_dim == 2:
+        ax = fig.gca()
+
+        from arraycontext import flatten
+        x_hmat = actx.to_numpy(flatten(x_hmat, actx))
+        x_ref = actx.to_numpy(flatten(x_ref, actx))
+
+        ax.semilogy(np.abs(x_hmat - x_ref))
+        ax.set_ylim([1.0e-16, 1.0])
+        fig.savefig(f"linalg_hmatrix_inverse_{case.name}_{case.op_type}_error")
+        fig.clf()
+
+    pt.close(fig)
+
+# }}}
+
+
+if __name__ == "__main__":
+    import sys
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])  # runs the code string given as first CLI argument
+    else:
+        from pytest import main
+        main([__file__])  # no argument: hand this file to pytest
+
+# vim: fdm=marker
diff --git a/test/test_linalg_proxy.py b/test/test_linalg_proxy.py
index 71d2df405..a3262f2e8 100644
--- a/test/test_linalg_proxy.py
+++ b/test/test_linalg_proxy.py
@@ -221,7 +221,7 @@ def test_partition_points(
     places = GeometryCollection(qbx, auto_where=case.name)
 
     density_discr = places.get_discretization(case.name)
-    mindex = case.get_cluster_index(actx, places)
+    mindex, _ = case.get_cluster_index(actx, places)
 
     expected_indices = np.arange(0, density_discr.ndofs)
     assert mindex.starts[-1] == density_discr.ndofs
@@ -268,7 +268,7 @@ def test_proxy_generator(actx_factory: ArrayContextFactory, case,
     places = GeometryCollection(qbx, auto_where=case.name)
 
     density_discr = places.get_discretization(case.name)
-    cindex = case.get_cluster_index(actx, places)
+    cindex, _ = case.get_cluster_index(actx, places)
 
     generator = proxy_generator_cls(places,
             approx_nproxy=case.proxy_approx_count,
@@ -314,7 +314,7 @@ def test_proxy_generator(actx_factory: ArrayContextFactory, case,
     ProxyGenerator, QBXProxyGenerator,
     ])
 @pytest.mark.parametrize("index_sparsity_factor", [1.0, 0.6])
-@pytest.mark.parametrize("proxy_radius_factor", [1, 1.1])
+@pytest.mark.parametrize("proxy_radius_factor", [1.0, 1.1])
 def test_neighbor_points(actx_factory: ArrayContextFactory, case,
         proxy_generator_cls, index_sparsity_factor, proxy_radius_factor,
         visualize=False):
@@ -337,7 +337,7 @@ def test_neighbor_points(actx_factory: ArrayContextFactory, case,
     dofdesc = places.auto_source
 
     density_discr = places.get_discretization(dofdesc.geometry)
-    srcindex = case.get_cluster_index(actx, places)
+    srcindex, _ = case.get_cluster_index(actx, places)
 
     # generate proxy points
     generator = proxy_generator_cls(places,
@@ -347,7 +347,7 @@ def test_neighbor_points(actx_factory: ArrayContextFactory, case,
 
     # get neighboring points
     from pytential.linalg.proxy import gather_cluster_neighbor_points
-    nbrindex = gather_cluster_neighbor_points(actx, pxy)
+    nbrindex = gather_cluster_neighbor_points(actx, pxy, srcindex)
 
     pxy = pxy.to_numpy(actx)
     nodes = actx.to_numpy(
diff --git a/test/test_linalg_skeletonization.py b/test/test_linalg_skeletonization.py
index 76ea5e995..a2c1fb809 100644
--- a/test/test_linalg_skeletonization.py
+++ b/test/test_linalg_skeletonization.py
@@ -127,39 +127,124 @@ def test_skeletonize_symbolic(actx_factory: ArrayContextFactory, case, visualize
     places = GeometryCollection(qbx, auto_where=dd)
 
     density_discr = places.get_discretization(dd.geometry, dd.discr_stage)
-    tgt_src_index = case.get_tgt_src_cluster_index(actx, places, dd)
+    tgt_src_index, ctree = case.get_tgt_src_cluster_index(actx, places, dd)
 
     logger.info("nclusters %3d ndofs %7d",
             tgt_src_index.nclusters, density_discr.ndofs)
 
     # }}}
 
-    # {{{ wranglers
+    from pytential.linalg.skeletonization import rec_skeletonize_by_proxy
 
-    from pytential.linalg.proxy import QBXProxyGenerator
-    proxy_generator = QBXProxyGenerator(places,
-            radius_factor=case.proxy_radius_factor,
-            approx_nproxy=case.proxy_approx_count)
+    sym_u, sym_op = case.get_operator(places.ambient_dim)
+    rec_skeletonize_by_proxy(
+        actx, places, ctree, tgt_src_index, sym_op, sym_u,
+        context=case.knl_concrete_kwargs,
+        auto_where=dd,
+        id_eps=1.0e-8,
+        rng=rng
+    )
+
+# }}}
+
+
+# {{{ test_skeletonize_diagonal
+
+@pytest.mark.parametrize("case", [
+    SKELETONIZE_TEST_CASES[0],
+    SKELETONIZE_TEST_CASES[1],
+    SKELETONIZE_TEST_CASES[2],
+    ])
+def test_skeletonize_diagonal(actx_factory, case, visualize=False):
+    import scipy.linalg.interpolative as sli
+    sli.seed(42)
+
+    actx = actx_factory()
+    rng = np.random.default_rng(42)
+
+    if visualize:
+        logging.basicConfig(level=logging.INFO)
+
+    # {{{ setup
+
+    dd = sym.DOFDescriptor(case.name, discr_stage=case.skel_discr_stage)
+    resolution = case.resolutions[-1]
+
+    qbx = case.get_layer_potential(actx, resolution, case.target_order)
+    places = GeometryCollection(qbx, auto_where=dd)
+
+    tgt_src_index, ctree = case.get_tgt_src_cluster_index(actx, places, dd)
 
-    from pytential.linalg.skeletonization import make_skeletonization_wrangler
     sym_u, sym_op = case.get_operator(places.ambient_dim)
-    wrangler = make_skeletonization_wrangler(places, sym_op, sym_u,
-            domains=None,
-            context=case.knl_concrete_kwargs,
-            _weighted_proxy=case.weighted_proxy,
-            _proxy_source_cluster_builder=case.proxy_source_cluster_builder,
-            _proxy_target_cluster_builder=case.proxy_target_cluster_builder,
-            _neighbor_cluster_builder=case.neighbor_cluster_builder)
 
     # }}}
 
-    from pytential.linalg.skeletonization import _skeletonize_block_by_proxy_with_mats
+    # {{{ check
 
-    _skeletonize_block_by_proxy_with_mats(
-        actx, 0, 0, places, proxy_generator, wrangler, tgt_src_index,
-        id_eps=1.0e-8,
+    from pytential.linalg.skeletonization import make_skeletonization_wrangler
+    wrangler = make_skeletonization_wrangler(
+            places, sym_op, sym_u,
+            auto_where=dd, context=case.knl_concrete_kwargs)
+
+    from pytential.linalg.skeletonization import rec_skeletonize_by_proxy
+    skeletons = rec_skeletonize_by_proxy(
+        actx, places, ctree, tgt_src_index, sym_op, sym_u,
+        auto_where=dd,
+        context=case.knl_concrete_kwargs,
+        approx_nproxy=case.proxy_approx_count,
+        proxy_radius_factor=case.proxy_radius_factor,
+        id_eps=case.id_eps,
         rng=rng,
-    )
+        _wrangler=wrangler,
+        )
+
+    from pytential.linalg.hmatrix import _update_skeleton_diagonal
+    for i in range(1, skeletons.size):
+        skeletons[i] = _update_skeleton_diagonal(
+            skeletons[i], skeletons[i - 1], ctree.levels[i - 1],
+            )
+
+    from pytential.linalg.cluster import cluster
+    parent = None
+    for k, clevel in enumerate(ctree.levels):
+        from pytential.linalg.utils import make_flat_cluster_diag
+
+        tgt_src_index = skeletons[k].tgt_src_index
+        D1 = wrangler.evaluate_self(actx, places, tgt_src_index, 0, 0)
+        D1 = make_flat_cluster_diag(D1, tgt_src_index)
+
+        if k == 0:
+            D = D0 = D1
+        else:
+            skel_tgt_src_index = skeletons[k - 1].skel_tgt_src_index
+            assert skel_tgt_src_index.shape == tgt_src_index.shape
+
+            D0 = wrangler.evaluate_self(actx, places, skel_tgt_src_index, 0, 0)
+            D0 = cluster(make_flat_cluster_diag(D0, skel_tgt_src_index), parent)
+
+            D = D1 - D0
+
+        parent = clevel
+
+        assert D1.shape == (skeletons[k].nclusters,)
+        assert D1.shape == D0.shape, (D1.shape, D0.shape)
+        assert D1.shape == D.shape, (D1.shape, D.shape)
+
+        for i in range(skeletons[k].nclusters):
+            assert D1[i].shape == skeletons[k].tgt_src_index.cluster_shape(i, i)
+            assert D1[i].shape == D0[i].shape, (D1[i].shape, D0[i].shape)
+
+            error = la.norm(D[i] - skeletons[k].D[i]) / (la.norm(D[i]) + 1.0e-12)
+            logger.info("level %04d / %04d cluster %3d (%4d, %4d) error %.12e",
+                k, ctree.nlevels,
+                ctree.tree_cluster_parent_ids[clevel.box_ids][i],
+                *skeletons[k].tgt_src_index.cluster_shape(i, i), error)
+
+            assert error < 1.0e-15
+
+        logger.info("")
+
+    # }}}
 
 # }}}
 
@@ -186,7 +271,7 @@ def run_skeletonize_by_proxy(actx, case, resolution,
 
     density_discr = places.get_discretization(dd.geometry, dd.discr_stage)
     if tgt_src_index is None:
-        tgt_src_index = case.get_tgt_src_cluster_index(actx, places, dd)
+        tgt_src_index, _ = case.get_tgt_src_cluster_index(actx, places, dd)
 
     logger.info("nclusters %3d ndofs %7d",
             tgt_src_index.nclusters, density_discr.ndofs)
@@ -362,19 +447,18 @@ def intersect1d(x, y):
 
     # }}}
 
-    return err_f, (places, mat)
+    return err_f, (places, mat, skeleton)
 
 
 @pytest.mark.parametrize("case", [
-    # NOTE: skip 2d tests, since they're better checked for convergence in
-    # `test_skeletonize_by_proxy_convergence`
-    # SKELETONIZE_TEST_CASES[0], SKELETONIZE_TEST_CASES[1],
+    SKELETONIZE_TEST_CASES[0],
+    SKELETONIZE_TEST_CASES[1],
     SKELETONIZE_TEST_CASES[2],
     ])
 def test_skeletonize_by_proxy(actx_factory: ArrayContextFactory, case, visualize=False):
-    r"""Test single-level skeletonization accuracy. Checks that the error
-    satisfies :math:`e < c \epsilon_{id}` for a fixed ID tolerance and an
-    empirically determined (not too huge) :math:`c`.
+    r"""Test multilevel skeletonization accuracy. Checks that the error for
+    every level satisfies :math:`e < c \epsilon_{id}` for a fixed ID tolerance
+    and an empirically determined (not too huge) :math:`c`.
     """
 
     import scipy.linalg.interpolative as sli
@@ -390,13 +474,27 @@ def test_skeletonize_by_proxy(actx_factory: ArrayContextFactory, case, visualize
     case = replace(case, approx_cluster_count=6, id_eps=1.0e-8)
     logger.info("\n%s", case)
 
-    run_skeletonize_by_proxy(
-        actx, case, case.resolutions[0],
-        ctol=10 * case.id_eps,
-        # FIXME: why is the 3D error so large?
-        rtol=10**case.ambient_dim * case.id_eps,
-        rng=rng,
-        visualize=visualize)
+    dd = sym.DOFDescriptor(case.name, discr_stage=case.skel_discr_stage)
+    qbx = case.get_layer_potential(actx, case.resolutions[0], case.target_order)
+    places = GeometryCollection(qbx, auto_where=dd)
+
+    tgt_src_index, ctree = case.get_tgt_src_cluster_index(actx, places, dd)
+    mat = None
+
+    from pytential.linalg.cluster import cluster
+    for clevel in ctree.levels[:-1]:
+        logger.info("[%2d/%2d] nclusters %3d",
+            clevel.level, ctree.nlevels, clevel.nclusters)
+
+        _, (_, mat, skeleton) = run_skeletonize_by_proxy(
+            actx, case, case.resolutions[0],
+            ctol=10 * case.id_eps,
+            # FIXME: why is the 3D error so large?
+            rtol=10**case.ambient_dim * case.id_eps,
+            places=places, mat=mat, rng=rng, tgt_src_index=tgt_src_index,
+            visualize=visualize)
+
+        tgt_src_index = cluster(skeleton.skel_tgt_src_index, clevel)
 
 # }}}
 
@@ -466,7 +564,7 @@ def test_skeletonize_by_proxy_convergence(
         # NOTE: don't skeletonize anymore if we reached zero error, but we still
         # want to loop to do `eoc.add_data_point()`
         if not was_zero:
-            rec_error[i], (places, mat) = run_skeletonize_by_proxy(
+            rec_error[i], (places, mat, _) = run_skeletonize_by_proxy(
                 actx, case, r, places=places, mat=mat,
                 suffix=f"{suffix}_{i:04d}", rng=rng, visualize=False)
 
diff --git a/test/test_matrix.py b/test/test_matrix.py
index c12c2c54f..e06a0015f 100644
--- a/test/test_matrix.py
+++ b/test/test_matrix.py
@@ -367,7 +367,7 @@ def test_cluster_builder(
 
     # {{{ matrix
 
-    mindex = case.get_tgt_src_cluster_index(actx, places)
+    mindex, _ = case.get_tgt_src_cluster_index(actx, places)
     kwargs = {
             "dep_expr": sym_u,
             "other_dep_exprs": [],
@@ -495,8 +495,8 @@ def test_build_matrix_fixed_stage(
     logger.info("ndofs:         %d", target_discr.ndofs)
 
     from pytential.linalg import TargetAndSourceClusterList
-    itargets = case.get_cluster_index(actx, places, target_dd)
-    jsources = case.get_cluster_index(actx, places, source_dd)
+    itargets, _ = case.get_cluster_index(actx, places, target_dd)
+    jsources, _ = case.get_cluster_index(actx, places, source_dd)
     mindex = TargetAndSourceClusterList(itargets, jsources)
 
     kwargs = {