diff --git a/CONTRIBUTING_MAC.md b/CONTRIBUTING_MAC.md
new file mode 100644
index 00000000..1fcf6f38
--- /dev/null
+++ b/CONTRIBUTING_MAC.md
@@ -0,0 +1,17 @@
+# Contributing as a Mac User
+
+## Installation
+
+For M1 Macs (Apple Silicon), various [online sources](https://github.com/scipy/scipy/issues/13409) recommend using OpenBLAS as a native (not Rosetta) linear algebra backend for dependencies like NumPy.
+```sh
+brew install openblas
+export OPENBLAS=$(/opt/homebrew/bin/brew --prefix openblas)
+```
+
+As [documented by Ray](https://docs.ray.io/en/latest/ray-overview/installation.html#m1-mac-apple-silicon-support), one must install miniforge, a community-driven Conda installer, to get packages built for `arm64`. In particular, the `grpcio` package from pip will not work with Ray, so uninstall it (`pip uninstall grpcio`) and replace it with the one from miniforge (`conda install grpcio`).
+
+
+## Testing
+
+`tests/core/backend/test_backend_init.py::test_head_detection`
+As [documented by Ray](https://github.com/ray-project/ray/issues/24130), the default behavior for fetching node IP addresses differs by platform: on Linux, a private address (e.g. `192.168.0.1`) is returned, while on Windows and macOS, `127.0.0.1` is returned. Because NumS is primarily released for Linux, this test will fail for Mac contributors, but it shouldn't affect development.
diff --git a/nums/core/array/application.py b/nums/core/array/application.py
index 48654617..9bc47aae 100644
--- a/nums/core/array/application.py
+++ b/nums/core/array/application.py
@@ -230,7 +230,11 @@ def loadtxt(
     def scalar(self, value):
         return BlockArray.from_scalar(value, self.km)
 
-    def array(self, array: Union[np.ndarray, sparse.COO, List[float]], block_shape: tuple = None):
+    def array(
+        self,
+        array: Union[np.ndarray, sparse.COO, List[float]],
+        block_shape: tuple = None,
+    ):
         if not isinstance(array, (np.ndarray, sparse.COO)):
             if array_utils.is_array_like(array):
                 array = np.array(array)
@@ -917,7 +921,7 @@ def map_uop(
         # TODO(hme): Faster to create ndarray first,
         # and instantiate block array on return
         # to avoid instantiating blocks on BlockArray initialization.
-        rarr.blocks[grid_entry] = arr.blocks[grid_entry].uop_map(
+        rarr.blocks[grid_entry] = arr.blocks[grid_entry].map_uop(
            op_name, args=args, kwargs=kwargs
        )
        return rarr
@@ -928,7 +932,7 @@ def matmul(self, arr_1: BlockArray, arr_2: BlockArray) -> BlockArray:
     def tensordot(
         self, arr_1: BlockArray, arr_2: BlockArray, axes: int = 2
     ) -> BlockArray:
-        return arr_1.tensordot(arr_2, axes)
+        return arr_1.tensordot(arr_1, arr_2, axes)
 
     def einsum(self, subscript, *operands):
         def _compute_syskwargs(blocks):
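Note on the `tensordot` change above: later in this diff, `BlockArray.tensordot` becomes a static method taking both operands, so calling it through `arr_1` with `arr_1` repeated as the first argument is intentional. A minimal, self-contained sketch of the pattern (toy classes, not NumS's actual implementation):

```python
# Illustrative sketch of static-method binary dispatch; ArrayBase/DenseArray
# are hypothetical stand-ins, not NumS classes.
class ArrayBase:
    @staticmethod
    def tensordot(a, b, axes=2):
        raise NotImplementedError()


class DenseArray(ArrayBase):
    @staticmethod
    def tensordot(a, b, axes=2):
        # Both operands are explicit parameters, so dense/sparse subclasses
        # can each own the full logic for either operand order.
        return ("tensordot", a, b, axes)


a, b = DenseArray(), DenseArray()
# Calling through the instance still passes both operands explicitly,
# which is why `arr_1.tensordot(arr_1, arr_2, axes)` above is well-formed.
assert a.tensordot(a, b, 1) == DenseArray.tensordot(a, b, 1)
```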
diff --git a/nums/core/array/base.py b/nums/core/array/base.py
index 9d0e34c5..a09bbbb1 100644
--- a/nums/core/array/base.py
+++ b/nums/core/array/base.py
@@ -13,9 +13,7 @@
 # limitations under the License.
 
 
-# pylint: disable = protected-access
-
-
+import warnings
 import numpy as np
 
 from nums.core.array import utils as array_utils
@@ -23,11 +21,11 @@
 from nums.core.grid.grid import ArrayGrid
 
 
-class Block:
-    # pylint: disable=redefined-builtin, global-statement
-    # TODO(hme): Create a base class, and move this concrete class into blockarray.py.
-    # Do this when we implement a SparseBlock object.
+# pylint: disable=protected-access, redefined-builtin
+# pylint: disable=too-many-lines
+
+class BlockBase:
     block_id_counter = -1
 
     def __init__(
@@ -40,7 +38,7 @@ def __init__(
         km: KernelManager,
         id=None,
     ):
-        self._km = km
+        self.km = km
         self.grid_entry: tuple = grid_entry
         self.grid_shape: tuple = grid_shape
         self.oid: np.object = None
@@ -54,33 +52,35 @@ def __init__(
         self.id = Block.block_id_counter
         # Set if a device id was used to compute this block.
         self._device = None
-        self.fill_value = None
 
     @property
-    def nnz(self):
-        return np.prod(self.shape)
+    def is_dense(self):
+        raise NotImplementedError()
 
     @property
-    def is_dense(self):
-        return self.fill_value is None
+    def nbytes(self):
+        raise NotImplementedError()
 
     def __repr__(self):
-        return "Block(" + str(self.oid) + ")"
+        raise NotImplementedError()
 
     def size(self):
         return np.product(self.shape)
 
     def copy(self, shallow=True):
-        assert shallow, "Only shallow copies are currently supported."
-        block = Block(
-            self.grid_entry,
-            self.grid_shape,
-            self.shape,
-            self.dtype,
-            self.transposed,
-            self._km,
+        raise NotImplementedError()
+
+    def get(self):
+        return self.km.get(self.oid)
+
+    def astype(self, dtype):
+        block = self.copy()
+        block.dtype = dtype
+        block.oid = self.km.astype(
+            self.oid,
+            dtype.__name__,
+            syskwargs={"grid_entry": block.grid_entry, "grid_shape": block.grid_shape},
         )
-        block.oid = self.oid
         return block
 
     def true_grid_entry(self):
@@ -93,26 +93,19 @@ def true_grid_shape(self):
             return tuple(reversed(self.grid_shape))
         return self.grid_shape
 
-    def device(self):
-        if self._device is not None:
-            return self._device
-        return self._km.device_grid.get_device(
-            self.true_grid_entry(), self.true_grid_shape()
-        )
-
     def transpose(self, defer=False, redistribute=False):
         # If defer is True, this operation does not modify the remote object.
         # If defer is True and redistribute is False,
         # this operation does not move the remote object.
         grid_entryT = tuple(reversed(self.grid_entry))
         grid_shapeT = tuple(reversed(self.grid_shape))
-        blockT = Block(
+        blockT = type(self)(
             grid_entry=grid_entryT,
             grid_shape=grid_shapeT,
             shape=tuple(reversed(self.shape)),
             dtype=self.dtype,
             transposed=not self.transposed,
-            km=self._km,
+            km=self.km,
         )
         blockT.oid = self.oid
         if not defer:
@@ -124,9 +117,16 @@ def transpose(self, defer=False, redistribute=False):
                 "grid_entry": self.grid_entry,
                 "grid_shape": self.grid_shape,
             }
-            blockT.oid = self._km.transpose(self.oid, syskwargs=syskwargs)
+            blockT.oid = self.km.transpose(self.oid, syskwargs=syskwargs)
         return blockT
 
+    def device(self):
+        if self._device is not None:
+            return self._device
+        return self.km.device_grid.get_device(
+            self.true_grid_entry(), self.true_grid_shape()
+        )
+
     def swapaxes(self, axis1, axis2):
         block = self.copy()
         grid_entry = list(block.grid_entry)
@@ -141,7 +141,7 @@ def swapaxes(self, axis1, axis2):
         block.grid_shape = tuple(grid_shape)
         block.shape = tuple(shape)
 
-        block.oid = self._km.swapaxes(
+        block.oid = self.km.swapaxes(
             block.oid,
             axis1,
             axis2,
@@ -149,48 +149,20 @@ def swapaxes(self, axis1, axis2):
         )
         return block
 
+    def map_uop(self, op_name, args=None, kwargs=None, device=None):
+        raise NotImplementedError()
+
     def ufunc(self, op_name, device=None):
-        return self.uop_map(op_name, device=device)
+        return self.map_uop(op_name, device=device)
 
-    def uop_map(self, op_name, args=None, kwargs=None, device=None):
-        # This retains transpose.
-        block = self.copy()
-        block.dtype = array_utils.get_uop_output_type(op_name, self.dtype)
-        args = () if args is None else args
-        kwargs = {} if kwargs is None else kwargs
-        if device is None:
-            syskwargs = {"grid_entry": block.grid_entry, "grid_shape": block.grid_shape}
-        else:
-            syskwargs = {"device": device}
-            block._device = device
-        block.oid = self._km.map_uop(
-            op_name, self.oid, args, kwargs, syskwargs=syskwargs
-        )
-        return block
+    def block_from_scalar(self, other):
+        raise NotImplementedError()
 
-    def _block_from_other(self, other):
-        # Assume other is numeric.
-        # This only occurs during some numpy operations (e.g. np.mean),
-        # where a literal is used in the operation.
-        assert isinstance(other, (int, float, np.int, np.float))
-        block = Block(
-            self.grid_entry,
-            self.grid_shape,
-            (1,),
-            self.dtype,
-            False,
-            self._km,
-        )
-        # We pass syskwargs here for correct node placement for `other`,
-        # which should be local to self.
-        block.oid = self._km.put(
-            np.array(other, dtype=self.dtype),
-            syskwargs={
-                "grid_entry": self.grid_entry,
-                "grid_shape": self.grid_shape,
-            },
-        )
-        return block
+    def conjugate(self):
+        return self.ufunc("conjugate")
+
+    def sqrt(self):
+        return self.ufunc("sqrt")
 
     @staticmethod
     def block_meta(op, block1, block2, args):
@@ -227,77 +199,105 @@ def block_meta(op, block1, block2, args):
 
     @staticmethod
     def init_block(op_name, block1, block2, args, device=None):
-        result_grid_entry, result_grid_shape, result_shape, dtype = Block.block_meta(
-            op_name, block1, block2, args
-        )
-        block = Block(
-            grid_entry=result_grid_entry,
-            grid_shape=result_grid_shape,
-            shape=result_shape,
-            dtype=dtype,
-            transposed=False,
-            km=block1._km,
-        )
-        block._device = device
-        return block
+        raise NotImplementedError()
+
+    def _check_bop_implemented(self, other):
+        raise NotImplementedError()
+
+    @staticmethod
+    def binary_op(op_name, a, b, args: dict, device=None):
+        raise NotImplementedError()
 
     def bop(self, op_name, other, args: dict, device=None):
-        if not isinstance(other, Block):
-            other = self._block_from_other(other)
-        block: Block = self.init_block(op_name, self, other, args, device)
-        if device is None:
-            syskwargs = {"grid_entry": block.grid_entry, "grid_shape": block.grid_shape}
-        else:
-            syskwargs = {"device": device}
-        block.oid = self._km.bop(
-            op_name,
-            self.oid,
-            other.oid,
-            self.transposed,
-            other.transposed,
-            axes=args.get("axes"),
-            syskwargs=syskwargs,
-        )
-        return block
+        raise NotImplementedError()
 
     def tensordot(self, other, axes):
-        return self.bop("tensordot", other, args={"axes": axes})
+        raise NotImplementedError()
 
     def __add__(self, other):
-        return self.bop("add", other, args={})
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.binary_op("add", self, other, args={})
+
+    def __radd__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.binary_op("add", other, self, args={})
 
     def __sub__(self, other):
-        return self.bop("sub", other, args={})
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.binary_op("sub", self, other, args={})
+
+    def __rsub__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.binary_op("sub", other, self, args={})
 
     def __mul__(self, other):
-        return self.bop("mul", other, args={})
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.binary_op("mul", self, other, args={})
 
-    def __matmul__(self, other):
-        return self.tensordot(other, axes=1)
+    def __rmul__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.binary_op("mul", other, self, args={})
 
     def __truediv__(self, other):
-        return self.bop("truediv", other, args={})
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.binary_op("truediv", self, other, args={})
+
+    def __rtruediv__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.binary_op("truediv", other, self, args={})
 
     def __pow__(self, other):
-        return self.bop("pow", other, args={})
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.binary_op("pow", self, other, args={})
+
+    def __rpow__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.binary_op("pow", other, self, args={})
+
+    def __matmul__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.tensordot(other, axes=1)
 
     def __ge__(self, other):
-        return self.bop("ge", other, args={})
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.binary_op("ge", self, other, args={})
 
     def __gt__(self, other):
-        return self.bop("gt", other, args={})
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.binary_op("gt", self, other, args={})
 
     def __le__(self, other):
-        return self.bop("le", other, args={})
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.binary_op("le", self, other, args={})
 
     def __lt__(self, other):
-        return self.bop("lt", other, args={})
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.binary_op("lt", self, other, args={})
 
     def __eq__(self, other):
-        return self.bop("eq", other, args={})
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.binary_op("eq", self, other, args={})
 
     def __ne__(self, other):
-        return self.bop("ne", other, args={})
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.binary_op("ne", self, other, args={})
 
     __iadd__ = __add__
     __isub__ = __sub__
@@ -306,24 +306,134 @@ def __ne__(self, other):
     __itruediv__ = __truediv__
     __ipow__ = __pow__
 
-    def astype(self, dtype):
+
+class Block(BlockBase):
+    # pylint: disable=redefined-builtin, global-statement
+
+    @property
+    def is_dense(self):
+        return True
+
+    @property
+    def nbytes(self):
+        return np.prod(self.shape) * np.dtype(self.dtype).itemsize
+
+    def __repr__(self):
+        return "Block(" + str(self.oid) + ")"
+
+    def copy(self, shallow=True):
+        assert shallow, "Only shallow copies are currently supported."
+        block = Block(
+            self.grid_entry,
+            self.grid_shape,
+            self.shape,
+            self.dtype,
+            self.transposed,
+            self.km,
+        )
+        block.oid = self.oid
+        return block
+
+    def map_uop(self, op_name, args=None, kwargs=None, device=None):
+        # This retains transpose.
         block = self.copy()
-        block.dtype = dtype
-        block.oid = self._km.astype(
-            self.oid,
-            dtype.__name__,
-            syskwargs={"grid_entry": block.grid_entry, "grid_shape": block.grid_shape},
+        block.dtype = array_utils.get_uop_output_type(op_name, self.dtype)
+        args = () if args is None else args
+        kwargs = {} if kwargs is None else kwargs
+        if device is None:
+            syskwargs = {"grid_entry": block.grid_entry, "grid_shape": block.grid_shape}
+        else:
+            syskwargs = {"device": device}
+            block._device = device
+        block.oid = self.km.map_uop(
+            op_name, self.oid, args, kwargs, syskwargs=syskwargs
         )
         return block
 
-    def conjugate(self):
-        return self.ufunc("conjugate")
+    def block_from_scalar(self, other):
+        # Assume other is numeric.
+        # This only occurs during some numpy operations (e.g. np.mean),
+        # where a literal is used in the operation.
+        assert array_utils.is_scalar(other)
+        block = Block(
+            self.grid_entry,
+            self.grid_shape,
+            (1,),
+            self.dtype,
+            False,
+            self.km,
+        )
+        # We pass syskwargs here for correct node placement for `other`,
+        # which should be local to self.
+        block.oid = self.km.put(
+            np.array(other, dtype=self.dtype),
+            syskwargs={
+                "grid_entry": self.grid_entry,
+                "grid_shape": self.grid_shape,
+            },
+        )
+        return block
 
-    def sqrt(self):
-        return self.ufunc("sqrt")
+    @staticmethod
+    def init_block(op_name, block1, block2, args, device=None):
+        (
+            result_grid_entry,
+            result_grid_shape,
+            result_shape,
+            dtype,
+        ) = BlockBase.block_meta(op_name, block1, block2, args)
+        block = Block(
+            grid_entry=result_grid_entry,
+            grid_shape=result_grid_shape,
+            shape=result_shape,
+            dtype=dtype,
+            transposed=False,
+            km=block1.km,
+        )
+        block._device = device
+        return block
 
-    def get(self):
-        return self._km.get(self.oid)
+    def _check_bop_implemented(self, other):
+        if isinstance(other, Block) or array_utils.is_scalar(other):
+            return True
+        return False
+
+    @staticmethod
+    def binary_op(op_name, a, b, args: dict, device=None):
+        if isinstance(a, Block) and array_utils.is_scalar(b):
+            b = a.block_from_scalar(b)
+        elif isinstance(b, Block) and array_utils.is_scalar(a):
+            a = b.block_from_scalar(a)
+        if not isinstance(a, Block) or not isinstance(b, Block):
+            raise NotImplementedError()
+
+        block: Block = a.init_block(op_name, a, b, args, device)
+        if device is None:
+            syskwargs = {"grid_entry": block.grid_entry, "grid_shape": block.grid_shape}
+        else:
+            syskwargs = {"device": device}
+        block.oid = a.km.bop(
+            op_name,
+            a.oid,
+            b.oid,
+            a.transposed,
+            b.transposed,
+            axes=args.get("axes"),
+            syskwargs=syskwargs,
+        )
+        return block
+
+    def bop(self, op_name, other, args: dict, device=None):
+        try:
+            return self.binary_op(op_name, self, other, args, device)
+        except NotImplementedError:
+            return other.binary_op(op_name, self, other, args, device)
+
+    def tensordot(self, other, axes):
+        try:
+            return self.binary_op("tensordot", self, other, args={"axes": axes})
+        except NotImplementedError:
+            return other.binary_op("tensordot", self, other, args={"axes": axes})
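The dunders introduced in `BlockBase` lean on Python's binary-operator protocol: returning `NotImplemented` makes the interpreter retry with the right operand's reflected method, which is how dense and sparse blocks hand operations to each other. A runnable toy illustration (`Dense`/`Sparse` are hypothetical stand-ins, not NumS classes):

```python
class Dense:
    def __add__(self, other):
        if not isinstance(other, Dense):
            return NotImplemented  # tell Python to try other.__radd__
        return "dense + dense"


class Sparse:
    def __radd__(self, other):
        # Invoked by Python after Dense.__add__ returns NotImplemented.
        return "sparse handles dense + sparse"


assert Dense() + Dense() == "dense + dense"
assert Dense() + Sparse() == "sparse handles dense + sparse"
```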
 
 
 class BlockArrayBase:
@@ -336,40 +446,31 @@ def __init__(self, grid: ArrayGrid, km: KernelManager, blocks: np.ndarray = None):
         self.size = np.product(self.shape)
         self.ndim = len(self.shape)
         self.dtype = self.grid.dtype
-        try:
-            self.nbytes = self.grid.nbytes()
-        except ValueError as _:
-            self.nbytes = None
         self.blocks = blocks
-        if self.blocks is None:
-            # TODO (hme): Subclass np.ndarray for self.blocks instances,
-            # and override key methods to better integrate with NumPy's ufuncs.
-            self.blocks = np.empty(shape=self.grid.grid_shape, dtype=Block)
-            for grid_entry in self.grid.get_entry_iterator():
-                self.blocks[grid_entry] = Block(
-                    grid_entry=grid_entry,
-                    grid_shape=self.grid.grid_shape,
-                    shape=self.grid.get_block_shape(grid_entry),
-                    dtype=self.dtype,
-                    transposed=False,
-                    km=self.km,
-                )
-        self.fill_value = None
-
-    @property
-    def nnz(self):
-        return np.prod(self.shape)
 
     @property
     def is_dense(self):
-        return self.fill_value is None
+        raise NotImplementedError()
 
     def __repr__(self):
         return "BlockArray(" + str(self.blocks) + ")"
 
+    def __getattr__(self, item):
+        if item == "__array_priority__" or item == "__array_struct__":
+            # This is triggered by a numpy array on the LHS.
+            raise TypeError(
+                "Unexpected conversion attempt from BlockArrayBase to ndarray."
+            )
+        elif item == "ndim":
+            return len(self.shape)
+        elif item == "T":
+            return self.transpose()
+        else:
+            raise NotImplementedError(item)
+
     def get(self) -> np.ndarray:
         result: np.ndarray = np.zeros(shape=self.grid.shape, dtype=self.grid.dtype)
-        block_shape: np.ndarray = np.array(self.grid.block_shape, dtype=np.int)
+        block_shape: np.ndarray = np.array(self.grid.block_shape, dtype=np.int64)
         arrays: list = self.km.get(
             [
                 self.blocks[grid_entry].oid
@@ -378,16 +479,83 @@ def get(self) -> np.ndarray:
         )
         for block_index, grid_entry in enumerate(self.grid.get_entry_iterator()):
             start = block_shape * grid_entry
-            entry_shape = np.array(self.grid.get_block_shape(grid_entry), dtype=np.int)
+            entry_shape = np.array(
+                self.grid.get_block_shape(grid_entry), dtype=np.int64
+            )
             end = start + entry_shape
             slices = tuple(map(lambda item: slice(*item), zip(*(start, end))))
-            block: Block = self.blocks[grid_entry]
+            block: BlockBase = self.blocks[grid_entry]
             arr: np.ndarray = arrays[block_index]
             if block.transposed:
                 arr = arr.T
             result[slices] = arr.reshape(block.shape)
         return result
 
+    def touch(self):
+        """
+        "Touch" an array. This is an efficient distributed "wait" operation.
+        """
+        oids = []
+        for grid_entry in self.grid.get_entry_iterator():
+            block: BlockBase = self.blocks[grid_entry]
+            oids.append(
+                self.km.touch(
+                    block.oid,
+                    syskwargs={
+                        "grid_entry": block.grid_entry,
+                        "grid_shape": block.grid_shape,
+                    },
+                )
+            )
+        self.km.get(oids)
+        return self
+
+    def copy(self):
+        raise NotImplementedError()
+
+    def astype(self, dtype):
+        raise NotImplementedError()
+
+    def flattened_oids(self):
+        oids = []
+        for grid_entry in self.grid.get_entry_iterator():
+            oid = self.blocks[grid_entry].oid
+            oids.append(oid)
+        return oids
+
+    @classmethod
+    def empty(cls, shape, block_shape, dtype, km: KernelManager):
+        raise NotImplementedError()
+
+    @staticmethod
+    def to_block_array(obj, km: KernelManager, block_shape=None):
+        raise NotImplementedError()
+
+    def transpose(self, defer=False, redistribute=False):
+        """
+        Transpose this matrix. Only use defer with arithmetic operations.
+        Setting redistribute to True may significantly impact performance.
+        :param defer: When true, the transpose operation will be applied
+        with the next arithmetic operation.
+        :param redistribute: If defer is false, setting this to true will
+        redistribute the data according to the device grid (data placement policy).
+        This parameter has no effect when defer is true.
+        :return: The transposed matrix.
+        """
+        if defer and redistribute:
+            warnings.warn("defer is True, redistribute=True will be ignored.")
+        metaT = self.grid.to_meta()
+        metaT["shape"] = tuple(reversed(metaT["shape"]))
+        metaT["block_shape"] = tuple(reversed(metaT["block_shape"]))
+        gridT = ArrayGrid.from_meta(metaT)
+        rarrT = type(self)(gridT, self.km)
+        rarrT.blocks = np.copy(self.blocks.T)
+        for grid_entry in rarrT.grid.get_entry_iterator():
+            rarrT.blocks[grid_entry] = rarrT.blocks[grid_entry].transpose(
+                defer, redistribute
+            )
+        return rarrT
+
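Since `defer=True` only flips per-block metadata, a deferred transpose is effectively free until the next kernel call. A pure-NumPy sketch of the same idea (illustrative only; NumS tracks the flag per remote block object):

```python
import numpy as np


# Sketch of deferred transpose: flip a flag instead of moving data, and let
# the consumer apply .T at kernel-execution time.
class LazyT:
    def __init__(self, data, transposed=False):
        self.data, self.transposed = data, transposed

    def transpose(self):
        # O(1): shares the buffer, only metadata changes.
        return LazyT(self.data, not self.transposed)

    def matmul(self, other):
        a = self.data.T if self.transposed else self.data
        b = other.data.T if other.transposed else other.data
        return a @ b


x = LazyT(np.arange(6.0).reshape(2, 3))
gram = x.transpose().matmul(x)  # X.T @ X without copying X
assert gram.shape == (3, 3)
```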
+ """ + if defer and redistribute: + warnings.warn("defer is True, redistribute=True will be ignored.") + metaT = self.grid.to_meta() + metaT["shape"] = tuple(reversed(metaT["shape"])) + metaT["block_shape"] = tuple(reversed(metaT["block_shape"])) + gridT = ArrayGrid.from_meta(metaT) + rarrT = type(self)(gridT, self.km) + rarrT.blocks = np.copy(self.blocks.T) + for grid_entry in rarrT.grid.get_entry_iterator(): + rarrT.blocks[grid_entry] = rarrT.blocks[grid_entry].transpose( + defer, redistribute + ) + return rarrT + def broadcast_to(self, shape): b = array_utils.broadcast(self.shape, shape) result_block_shape = array_utils.broadcast_block_shape( @@ -410,3 +578,286 @@ def broadcast_to(self, shape): broadcast = it.itviews[0] result.blocks = broadcast return result + + def reduce_axis(self, op_name, axis, keepdims=False): + raise NotImplementedError() + + def tree_reduce( + self, op_name, blocks_or_oids, result_grid_entry, result_grid_shape, *args + ): + raise NotImplementedError() + + def check_or_convert_other(self, other, compute_block_shape=False): + raise NotImplementedError() + + def _check_bop_implemented(self, other): + raise NotImplementedError() + + # All operators: https://docs.python.org/3/library/operator.html + + ################# + # Unary functions + ################# + + def ufunc(self, op_name): + raise NotImplementedError() + + def __neg__(self): + return self.ufunc("negative") + + def __pos__(self): + return self + + def __abs__(self): + return self.ufunc("abs") + + def __invert__(self): + return self.ufunc("invert") + + ################# + # Arithmetic + ################# + + @staticmethod + def elementwise(op_name, a, b): + raise NotImplementedError() + + def __mod__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("mod", self, other) + + def __rmod__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("mod", other, self) + + __imod__ = __mod__ + + def __add__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("add", self, other) + + def __radd__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("add", other, self) + + __iadd__ = __add__ + + def __sub__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("sub", self, other) + + def __rsub__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("sub", other, self) + + __isub__ = __sub__ + + def __mul__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("mul", self, other) + + def __rmul__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("mul", other, self) + + __imul__ = __mul__ + + def __truediv__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("truediv", self, other) + + def __rtruediv__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("truediv", other, self) + + __itruediv__ = __truediv__ + + def __floordiv__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("floor_divide", self, other) + + def __rfloordiv__(self, other): + if not self._check_bop_implemented(other): + return 
+
+    def __floordiv__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.elementwise("floor_divide", self, other)
+
+    def __rfloordiv__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.elementwise("floor_divide", other, self)
+
+    __ifloordiv__ = __floordiv__
+
+    def __pow__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.elementwise("pow", self, other)
+
+    def __rpow__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.elementwise("pow", other, self)
+
+    __ipow__ = __pow__
+
+    ##################
+    # Boolean
+    ##################
+
+    # TODO (hme): Type check bool ops.
+    def __bool__(self):
+        # pylint: disable=no-member
+        if np.sum(self.shape) == len(self.shape):
+            # If all ones or scalar, then this is defined.
+            return self.get().__bool__()
+        return True
+
+    def __or__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.elementwise("bitwise_or", self, other)
+
+    def __ror__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.elementwise("bitwise_or", other, self)
+
+    __ior__ = __or__
+
+    def __and__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.elementwise("bitwise_and", self, other)
+
+    def __rand__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.elementwise("bitwise_and", other, self)
+
+    __iand__ = __and__
+
+    def __xor__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.elementwise("bitwise_xor", self, other)
+
+    def __rxor__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.elementwise("bitwise_xor", other, self)
+
+    __ixor__ = __xor__
+
+    def __lshift__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.elementwise("left_shift", self, other)
+
+    def __rlshift__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.elementwise("left_shift", other, self)
+
+    __ilshift__ = __lshift__
+
+    def __rshift__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.elementwise("right_shift", self, other)
+
+    def __rrshift__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.elementwise("right_shift", other, self)
+
+    __irshift__ = __rshift__
+
+    #################
+    # Linear Algebra
+    #################
+
+    def _compute_tensordot_syskwargs(
+        self, self_block: BlockBase, other_block: BlockBase
+    ):
+        # Schedule on larger block.
+        if np.product(self_block.shape) >= np.product(other_block.shape):
+            return self_block.true_grid_entry(), self_block.true_grid_shape()
+        else:
+            return other_block.true_grid_entry(), other_block.true_grid_shape()
+
+    @staticmethod
+    def tensordot(a, b, axes=2):
+        raise NotImplementedError()
+
+    def __matmul__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        if len(self.shape) > 2:
+            # TODO (bcp): NumPy's implementation does a stacked matmul, which is not supported yet.
+            raise NotImplementedError(
+                "Matrix multiply for tensors of rank > 2 not supported yet."
+            )
+        else:
+            return self.tensordot(self, other, 1)
+
+    def __rmatmul__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.tensordot(other, self, 1)
+
+    __imatmul__ = __matmul__
+
+    #################
+    # Inequalities
+    #################
+
+    def __inequality__(self, op_name, other):
+        raise NotImplementedError()
+
+    def __ge__(self, other):
+        return self.__inequality__("ge", other)
+
+    def __rge__(self, other):
+        other = self.check_or_convert_other(other)
+        return other.__inequality__("ge", self)
+
+    def __gt__(self, other):
+        return self.__inequality__("gt", other)
+
+    def __rgt__(self, other):
+        other = self.check_or_convert_other(other)
+        return other.__inequality__("gt", self)
+
+    def __le__(self, other):
+        return self.__inequality__("le", other)
+
+    def __rle__(self, other):
+        other = self.check_or_convert_other(other)
+        return other.__inequality__("le", self)
+
+    def __lt__(self, other):
+        return self.__inequality__("lt", other)
+
+    def __rlt__(self, other):
+        other = self.check_or_convert_other(other)
+        return other.__inequality__("lt", self)
+
+    def __eq__(self, other):
+        return self.__inequality__("eq", other)
+
+    def __req__(self, other):
+        other = self.check_or_convert_other(other)
+        return other.__inequality__("eq", self)
+
+    def __ne__(self, other):
+        return self.__inequality__("ne", other)
+
+    def __rne__(self, other):
+        other = self.check_or_convert_other(other)
+        return other.__inequality__("ne", self)
diff --git a/nums/core/array/blockarray.py b/nums/core/array/blockarray.py
index 46a7daf3..d6dabd6a 100644
--- a/nums/core/array/blockarray.py
+++ b/nums/core/array/blockarray.py
@@ -17,10 +17,9 @@
 import itertools
 
 import numpy as np
-import nums.core.array.sparse as array_sparse
 
 from nums.core.array import utils as array_utils
-from nums.core.array.base import BlockArrayBase, Block
+from nums.core.array.base import BlockBase, Block, BlockArrayBase
 from nums.core.array.view import ArrayView
 from nums.core.grid.grid import ArrayGrid
 from nums.core.kernel.kernel_manager import KernelManager
@@ -30,6 +29,32 @@
 
 
 class BlockArray(BlockArrayBase):
+    def __init__(self, grid: ArrayGrid, km: KernelManager, blocks: np.ndarray = None):
+        if blocks is not None:
+            assert blocks.dtype == Block, "BlockArray must be initialized with Blocks"
+        super().__init__(grid, km, blocks)
+        try:
+            self.nbytes = self.grid.nbytes()
+        except ValueError as _:
+            self.nbytes = None
+        if self.blocks is None:
+            # TODO (hme): Subclass np.ndarray for self.blocks instances,
+            # and override key methods to better integrate with NumPy's ufuncs.
+            self.blocks = np.empty(shape=self.grid.grid_shape, dtype=Block)
+            for grid_entry in self.grid.get_entry_iterator():
+                self.blocks[grid_entry] = Block(
+                    grid_entry=grid_entry,
+                    grid_shape=self.grid.grid_shape,
+                    shape=self.grid.get_block_shape(grid_entry),
+                    dtype=self.dtype,
+                    transposed=False,
+                    km=self.km,
+                )
+
+    @property
+    def is_dense(self):
+        return True
+
     @classmethod
     def empty(cls, shape, block_shape, dtype, km: KernelManager):
         return BlockArray.create("empty", shape, block_shape, dtype, km)
@@ -115,24 +140,12 @@ def copy(self):
             rarr_copy.blocks[grid_entry] = self.blocks[grid_entry].copy()
         return rarr_copy
 
-    def touch(self):
-        """
-        "Touch" an array. This is an efficient distributed "wait" operation.
- """ - oids = [] - for grid_entry in self.grid.get_entry_iterator(): - block: Block = self.blocks[grid_entry] - oids.append( - self.km.touch( - block.oid, - syskwargs={ - "grid_entry": block.grid_entry, - "grid_shape": block.grid_shape, - }, - ) - ) - self.km.get(oids) - return self + def astype(self, dtype): + grid = ArrayGrid(self.shape, self.block_shape, dtype.__name__) + result = BlockArray(grid, self.km) + for grid_entry in result.grid.get_entry_iterator(): + result.blocks[grid_entry] = self.blocks[grid_entry].astype(dtype) + return result def is_single_block(self): return self.blocks.size == 1 @@ -223,42 +236,6 @@ def swapaxes(self, axis1, axis2): rarr_swap = BlockArray(grid_swap, self.km, rarr_src) return rarr_swap - def transpose(self, defer=False, redistribute=False): - """ - Transpose this matrix. Only use defer with arithmetic operations. - Setting redistribute to True may significantly impact performance. - :param defer: When true, the transpose operation will be applied - with the next arithmetic operation. - :param redistribute: If defer is false, setting this to true will - redistribute the data according to the device grid (data placement policy). - This parameter has no effect when defer is true. - :return: The transposed matrix. - """ - if defer and redistribute: - warnings.warn("defer is True, redistribute=True will be ignored.") - metaT = self.grid.to_meta() - metaT["shape"] = tuple(reversed(metaT["shape"])) - metaT["block_shape"] = tuple(reversed(metaT["block_shape"])) - gridT = ArrayGrid.from_meta(metaT) - rarrT = BlockArray(gridT, self.km) - rarrT.blocks = np.copy(self.blocks.T) - for grid_entry in rarrT.grid.get_entry_iterator(): - rarrT.blocks[grid_entry] = rarrT.blocks[grid_entry].transpose( - defer, redistribute - ) - return rarrT - - def __getattr__(self, item): - if item == "__array_priority__" or item == "__array_struct__": - # This is triggered by a numpy array on the LHS. - raise TypeError("Unexpected conversion attempt from BlockArray to ndarray.") - elif item == "ndim": - return len(self.shape) - elif item == "T": - return self.transpose() - else: - raise NotImplementedError(item) - def _preprocess_subscript(self, item): if not isinstance(item, tuple): ss = (item,) @@ -267,7 +244,7 @@ def _preprocess_subscript(self, item): # We need to fetch any block arrays. tmp = [] for entry in ss: - if isinstance(entry, BlockArray): + if isinstance(entry, BlockArrayBase): val = entry.get() else: val = entry @@ -323,7 +300,7 @@ def __getitem__(self, item): av: ArrayView = ArrayView.from_block_array(self) # TODO (hme): We don't have to create, but do so for now until we need to optimize. - return av[ss].create(BlockArray) + return av[ss].create() def _advanced_single_array_select(self, ss: tuple, axis: int = 0): # Create output array along the axis of the selection operation. 
@@ -355,7 +332,7 @@ def _advanced_single_array_select(self, ss: tuple, axis: int = 0):
                 shape.append(self.shape[i])
                 block_shape.append(self.block_shape[i])
 
-        dst_arr = BlockArray(
+        dst_arr = type(self)(
             ArrayGrid(
                 shape=tuple(shape),
                 block_shape=tuple(block_shape),
@@ -424,10 +401,11 @@ def _advanced_single_array_select(self, ss: tuple, axis: int = 0):
         return dst_arr
 
     def __setitem__(self, key, value):
-        value: BlockArray = BlockArray.to_block_array(value, self.km)
+        value: BlockArrayBase = self.to_block_array(value, self.km)
         ss, is_handled_advanced, axis = self._preprocess_subscript(key)
         if is_handled_advanced:
             return self._advanced_single_array_assign(ss, value, axis)
+
         av: ArrayView = ArrayView.from_block_array(self)
         av[key] = value
 
@@ -445,7 +423,7 @@ def _advanced_single_array_assign(
         # 2. value is scalar or 1-dimensional.
         # We currently don't support the case where value may broadcasted if it has more dims.
         # This should be a straight-forward future task.
-        value: BlockArray = value
+        value: BlockArrayBase = value
         mode = None
         if len(value.shape) == 0:
             # subscripted value per block will broadcast to other dimensions.
@@ -526,7 +504,7 @@ def _advanced_single_array_assign(
             },
         )
         for dst_grid_entry in dst_arr.grid.get_entry_iterator():
-            dst_block: Block = dst_arr.blocks[dst_grid_entry]
+            dst_block: BlockBase = dst_arr.blocks[dst_grid_entry]
             dst_coord: tuple = dst_arr.grid.get_entry_coordinates(dst_grid_entry)
 
             # Make sure index values in subscript are within bounds of dst_arr.
@@ -553,7 +531,7 @@ def _advanced_single_array_assign(
                 continue
 
             if mode == "scalar":
-                src_block: Block = src_arr.blocks.item()
+                src_block: BlockBase = src_arr.blocks.item()
                 src_coord: tuple = src_arr.grid.get_entry_coordinates(
                     src_block.grid_entry
                 )
@@ -571,7 +549,7 @@ def _advanced_single_array_assign(
                 )
             elif mode == "single-dim":
                 for src_grid_entry in src_arr.grid.get_entry_iterator():
-                    src_block: Block = src_arr.blocks[src_grid_entry]
+                    src_block: BlockBase = src_arr.blocks[src_grid_entry]
                     src_coord: tuple = src_arr.grid.get_entry_coordinates(
                         src_grid_entry
                     )
@@ -597,7 +575,7 @@ def _advanced_single_array_assign(
                         + [j]
                         + list(dst_grid_entry[axis + 1 :])
                     )
-                    src_block: Block = src_arr.blocks[src_grid_entry]
+                    src_block: BlockBase = src_arr.blocks[src_grid_entry]
                    src_coord: tuple = src_arr.grid.get_entry_coordinates(
                        src_grid_entry
                    )
@@ -635,60 +613,19 @@ def check_or_convert_other(self, other, compute_block_shape=False):
         block_shape = None if compute_block_shape else self.block_shape
         return BlockArray.to_block_array(other, self.km, block_shape=block_shape)
 
+    def _check_bop_implemented(self, other):
+        if isinstance(other, (BlockArray, np.ndarray, list)) or array_utils.is_scalar(
+            other
+        ):
+            return True
+        return False
+
     def ufunc(self, op_name):
         result = self.copy()
         for grid_entry in self.grid.get_entry_iterator():
             result.blocks[grid_entry] = self.blocks[grid_entry].ufunc(op_name)
         return result
 
-    def tree_reduce(
-        self, op_name, blocks_or_oids, result_grid_entry, result_grid_shape
-    ):
-        """
-        Basic tree reduce imp.
-        Schedules op on same node as left operand.
-        :param op_name: The reduction op.
-        :param blocks_or_oids: A list of type Block or a list of tuples.
-        Tuples must be of the form
-            (oid, grid_entry, grid_shape, transposed)
-        :param result_grid_entry: The grid entry of the result block. This will be used
-        to compute the final reduction step.
-        :param result_grid_shape: The grid entry of the result block. This will be used
-        to compute the final reduction step.
-        :return: The oid of the result.
- """ - oid_list = blocks_or_oids - if isinstance(blocks_or_oids[0], Block): - oid_list = [ - (b.oid, b.grid_entry, b.grid_shape, b.transposed) - for b in blocks_or_oids - ] - if len(oid_list) == 1: - return oid_list[0][0] - q = oid_list - while len(q) > 1: - a_oid, a_ge, a_gs, a_T = q.pop(0) - b_oid, _, _, b_T = q.pop(0) - ge, gs = ( - (result_grid_entry, result_grid_shape) if len(q) == 0 else (a_ge, a_gs) - ) - c_oid = self.km.bop_reduce( - op_name, - a_oid, - b_oid, - a_T, - b_T, - syskwargs={ - "grid_entry": ge, - "grid_shape": gs, - }, - ) - q.append((c_oid, ge, gs, False)) - r_oid, r_ge, r_gs, _ = q.pop(0) - assert r_ge == result_grid_entry - assert r_gs == result_grid_shape - return r_oid - def reduce_axis(self, op_name, axis, keepdims=False): if not (axis is None or isinstance(axis, (int, np.int32, np.int64))): raise NotImplementedError("Only integer axis is currently supported.") @@ -769,18 +706,115 @@ def reduce_axis(self, op_name, axis, keepdims=False): ) return result + def tree_reduce( + self, op_name, blocks_or_oids, result_grid_entry, result_grid_shape, *args + ): + """ + Basic tree reduce imp. + Schedules op on same node as left operand. + :param op_name: The reduction op. + :param blocks_or_oids: A list of type Block or a list of tuples. + Tuples must be of the form + (oid, grid_entry, grid_shape, transposed) + :param result_grid_entry: The grid entry of the result block. This will be used + to compute the final reduction step. + :param result_grid_shape: The grid entry of the result block. This will be used + to compute the final reduction step. + :return: The oid of the result. + """ + oid_list = blocks_or_oids + if isinstance(blocks_or_oids[0], Block): + oid_list = [ + (b.oid, b.grid_entry, b.grid_shape, b.transposed) + for b in blocks_or_oids + ] + if len(oid_list) == 1: + return oid_list[0][0] + q = oid_list + while len(q) > 1: + a_oid, a_ge, a_gs, a_T = q.pop(0) + b_oid, _, _, b_T = q.pop(0) + ge, gs = ( + (result_grid_entry, result_grid_shape) if len(q) == 0 else (a_ge, a_gs) + ) + c_oid = self.km.bop_reduce( + op_name, + a_oid, + b_oid, + a_T, + b_T, + syskwargs={ + "grid_entry": ge, + "grid_shape": gs, + }, + ) + q.append((c_oid, ge, gs, False)) + r_oid, r_ge, r_gs, _ = q.pop(0) + assert r_ge == result_grid_entry + assert r_gs == result_grid_shape + return r_oid + ################# - # Linear Algebra + # Arithmetic ################# - def _compute_tensordot_syskwargs(self, self_block: Block, other_block: Block): - # Schedule on larger block. - if np.product(self_block.shape) >= np.product(other_block.shape): - return self_block.true_grid_entry(), self_block.true_grid_shape() + @staticmethod + def elementwise(op_name, a, b): + if isinstance(a, BlockArray): + b = a.check_or_convert_other(b) + elif isinstance(b, BlockArray): + a = b.check_or_convert_other(a) else: - return other_block.true_grid_entry(), other_block.true_grid_shape() + raise NotImplementedError() + + if a.shape == b.shape and a.block_shape == b.block_shape: + return BlockArray._fast_elementwise(op_name, a, b) + blocks_op = a.blocks.__getattribute__("__%s__" % op_name) + return BlockArray.from_blocks(blocks_op(b.blocks), result_shape=None, km=a.km) + + @staticmethod + def _fast_elementwise(op_name, a, b): + """ + Implements fast scheduling for basic element-wise operations. + """ + dtype = array_utils.get_bop_output_type(op_name, a.dtype, b.dtype) + # Schedule the op first. 
+
     #################
-    # Linear Algebra
+    # Arithmetic
     #################
 
-    def _compute_tensordot_syskwargs(self, self_block: Block, other_block: Block):
-        # Schedule on larger block.
-        if np.product(self_block.shape) >= np.product(other_block.shape):
-            return self_block.true_grid_entry(), self_block.true_grid_shape()
+    @staticmethod
+    def elementwise(op_name, a, b):
+        if isinstance(a, BlockArray):
+            b = a.check_or_convert_other(b)
+        elif isinstance(b, BlockArray):
+            a = b.check_or_convert_other(a)
         else:
-            return other_block.true_grid_entry(), other_block.true_grid_shape()
+            raise NotImplementedError()
+
+        if a.shape == b.shape and a.block_shape == b.block_shape:
+            return BlockArray._fast_elementwise(op_name, a, b)
+        blocks_op = a.blocks.__getattribute__("__%s__" % op_name)
+        return BlockArray.from_blocks(blocks_op(b.blocks), result_shape=None, km=a.km)
+
+    @staticmethod
+    def _fast_elementwise(op_name, a, b):
+        """
+        Implements fast scheduling for basic element-wise operations.
+        """
+        dtype = array_utils.get_bop_output_type(op_name, a.dtype, b.dtype)
+        # Schedule the op first.
+        blocks = np.empty(shape=a.grid.grid_shape, dtype=Block)
+        for grid_entry in a.grid.get_entry_iterator():
+            a_block: Block = a.blocks[grid_entry]
+            b_block: Block = b.blocks[grid_entry]
+            blocks[grid_entry] = block = Block(
+                grid_entry=grid_entry,
+                grid_shape=a_block.grid_shape,
+                shape=a_block.shape,
+                dtype=dtype,
+                transposed=False,
+                km=a.km,
+            )
+            block.oid = a.km.bop(
+                op_name,
+                a_block.oid,
+                b_block.oid,
+                a_block.transposed,
+                b_block.transposed,
+                axes={},
+                syskwargs={
+                    "grid_entry": grid_entry,
+                    "grid_shape": a.grid.grid_shape,
+                },
+            )
+        return BlockArray(
+            ArrayGrid(a.shape, a.block_shape, dtype.__name__),
+            a.km,
+            blocks=blocks,
+        )
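`_fast_elementwise` skips broadcasting entirely when both operands share a shape and a block shape, pairing blocks one-to-one. A NumPy-only sketch of that aligned-grid pairing (plain arrays stand in for remote blocks):

```python
import numpy as np


# Aligned-grid elementwise sketch: when two block grids match exactly,
# the op is just a zip over corresponding blocks (no broadcasting logic).
def fast_elementwise(op, a_blocks, b_blocks):
    out = np.empty(a_blocks.shape, dtype=object)
    for idx in np.ndindex(a_blocks.shape):
        out[idx] = op(a_blocks[idx], b_blocks[idx])
    return out


a = np.empty((2, 2), dtype=object)
b = np.empty((2, 2), dtype=object)
for idx in np.ndindex(a.shape):
    a[idx] = np.ones((3, 3))
    b[idx] = np.full((3, 3), 2.0)
c = fast_elementwise(np.add, a, b)
assert (c[0, 0] == 3.0).all()
```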
 
-    def tensordot(self, other, axes=2):
+    #################
+    # Linear Algebra
+    #################
+
+    @staticmethod
+    def tensordot(a, b, axes=2):
         if isinstance(axes, int):
             pass
         elif array_utils.is_array_like(axes):
@@ -788,59 +822,59 @@ def tensordot(self, other, axes=2):
         else:
             raise TypeError(f"Unexpected axes type '{type(axes).__name__}'")
 
-        other = self.check_or_convert_other(other, compute_block_shape=True)
+        if isinstance(a, BlockArray):
+            b = a.check_or_convert_other(b, compute_block_shape=True)
+        elif isinstance(b, BlockArray):
+            a = b.check_or_convert_other(a, compute_block_shape=True)
+        else:
+            raise NotImplementedError()
 
-        if array_utils.np_tensordot_param_test(
-            self.shape, self.ndim, other.shape, other.ndim, axes
-        ):
+        if array_utils.np_tensordot_param_test(a.shape, a.ndim, b.shape, b.ndim, axes):
             raise ValueError("shape-mismatch for sum")
 
         if axes > 0:
-            this_axes = self.grid.grid_shape[:-axes]
-            this_sum_axes = self.grid.grid_shape[-axes:]
-            other_axes = other.grid.grid_shape[axes:]
-            other_sum_axes = other.grid.grid_shape[:axes]
-            assert this_sum_axes == other_sum_axes
-            result_shape = tuple(self.shape[:-axes] + other.shape[axes:])
-            result_block_shape = tuple(
-                self.block_shape[:-axes] + other.block_shape[axes:]
-            )
+            a_axes = a.grid.grid_shape[:-axes]
+            a_sum_axes = a.grid.grid_shape[-axes:]
+            b_axes = b.grid.grid_shape[axes:]
+            b_sum_axes = b.grid.grid_shape[:axes]
+            assert a_sum_axes == b_sum_axes
+            result_shape = tuple(a.shape[:-axes] + b.shape[axes:])
+            result_block_shape = tuple(a.block_shape[:-axes] + b.block_shape[axes:])
         else:
-            this_axes = self.grid.grid_shape
-            other_axes = other.grid.grid_shape
-            this_sum_axes = ()
-            result_shape = tuple(self.shape + other.shape)
-            result_block_shape = tuple(self.block_shape + other.block_shape)
+            a_axes = a.grid.grid_shape
+            b_axes = b.grid.grid_shape
+            a_sum_axes = ()
+            result_shape = tuple(a.shape + b.shape)
+            result_block_shape = tuple(a.block_shape + b.block_shape)
 
         result_grid = ArrayGrid(
             shape=result_shape,
             block_shape=result_block_shape,
             dtype=array_utils.get_bop_output_type(
-                "tensordot", self.dtype, other.dtype
+                "tensordot", a.dtype, b.dtype
             ).__name__,
         )
-        assert result_grid.grid_shape == tuple(this_axes + other_axes)
-        result = BlockArray(result_grid, self.km)
-        this_dims = list(itertools.product(*map(range, this_axes)))
-        other_dims = list(itertools.product(*map(range, other_axes)))
-        sum_dims = list(itertools.product(*map(range, this_sum_axes)))
-        for i in this_dims:
-            for j in other_dims:
+        assert result_grid.grid_shape == tuple(a_axes + b_axes)
+        result = BlockArray(result_grid, a.km)
+        a_dims = list(itertools.product(*map(range, a_axes)))
+        b_dims = list(itertools.product(*map(range, b_axes)))
+        sum_dims = list(itertools.product(*map(range, a_sum_axes)))
+        for i in a_dims:
+            for j in b_dims:
                 grid_entry = tuple(i + j)
                 result_block: Block = result.blocks[grid_entry]
                 sum_oids = []
                 for k in sum_dims:
-                    self_block: Block = self.blocks[tuple(i + k)]
-                    other_block: Block = other.blocks[tuple(k + j)]
-                    dot_grid_args = self._compute_tensordot_syskwargs(
-                        self_block, other_block
-                    )
-                    dotted_oid = self.km.bop(
+                    a_block: Block = a.blocks[tuple(i + k)]
+                    b_block: Block = b.blocks[tuple(k + j)]
+                    # pylint: disable=protected-access
+                    dot_grid_args = a._compute_tensordot_syskwargs(a_block, b_block)
+                    dotted_oid = a.km.bop(
                         "tensordot",
-                        self_block.oid,
-                        other_block.oid,
-                        self_block.transposed,
-                        other_block.transposed,
+                        a_block.oid,
+                        b_block.oid,
+                        a_block.transposed,
+                        b_block.transposed,
                         axes=axes,
                         syskwargs={
                             "grid_entry": dot_grid_args[0],
@@ -850,177 +884,19 @@ def tensordot(self, other, axes=2):
                             "grid_shape": dot_grid_args[1],
                         },
                    )
                    sum_oids.append(
                        (dotted_oid, dot_grid_args[0], dot_grid_args[1], False)
                    )
-                result_block.oid = self.tree_reduce(
+                result_block.oid = a.tree_reduce(
                     "sum", sum_oids, result_block.grid_entry, result_block.grid_shape
                 )
         return result
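For `axes=1` this reduces to blocked matrix multiply: each output block `(i, j)` tree-reduces the partial products `a[i, k] @ b[k, j]` over `k`. A self-contained NumPy sketch of the same loop structure (local arrays instead of object refs):

```python
import numpy as np


# Blocked matmul sketch mirroring the grid loop above: each output block
# (i, j) is the sum over k of a_blocks[i, k] @ b_blocks[k, j], which the
# real code computes remotely and combines with tree_reduce("sum", ...).
def blocked_matmul(a_blocks, b_blocks):
    grid_i, grid_k = a_blocks.shape
    grid_j = b_blocks.shape[1]
    out = np.empty((grid_i, grid_j), dtype=object)
    for i in range(grid_i):
        for j in range(grid_j):
            out[i, j] = sum(a_blocks[i, k] @ b_blocks[k, j] for k in range(grid_k))
    return out


a = np.empty((2, 2), dtype=object)
b = np.empty((2, 2), dtype=object)
for i in range(2):
    for j in range(2):
        a[i, j] = np.eye(2)
        b[i, j] = np.ones((2, 2))
c = blocked_matmul(a, b)
assert np.allclose(c[0, 0], 2 * np.ones((2, 2)))
```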
 
-    def __matmul__(self, other):
-        if len(self.shape) > 2:
-            # TODO (bcp): NumPy's implementation does a stacked matmul, which is not supported yet.
-            raise NotImplementedError(
-                "Matrix multiply for tensors of rank > 2 not supported yet."
-            )
-        else:
-            return self.tensordot(other, 1)
-
-    def __rmatmul__(self, other):
-        other = self.check_or_convert_other(other)
-        return other @ self
-
-    __imatmul__ = __matmul__
-
-    #################
-    # Arithmetic
-    #################
-
-    def _fast_element_wise(self, op_name, other):
-        """
-        Implements fast scheduling for basic element-wise operations.
-        """
-        dtype = array_utils.get_bop_output_type(op_name, self.dtype, other.dtype)
-        # Schedule the op first.
-        blocks = np.empty(shape=self.grid.grid_shape, dtype=Block)
-        for grid_entry in self.grid.get_entry_iterator():
-            self_block: Block = self.blocks[grid_entry]
-            other_block: Block = other.blocks[grid_entry]
-            blocks[grid_entry] = block = Block(
-                grid_entry=grid_entry,
-                grid_shape=self_block.grid_shape,
-                shape=self_block.shape,
-                dtype=dtype,
-                transposed=False,
-                km=self.km,
-            )
-            block.oid = self.km.bop(
-                op_name,
-                self_block.oid,
-                other_block.oid,
-                self_block.transposed,
-                other_block.transposed,
-                axes={},
-                syskwargs={
-                    "grid_entry": grid_entry,
-                    "grid_shape": self.grid.grid_shape,
-                },
-            )
-        return BlockArray(
-            ArrayGrid(self.shape, self.block_shape, dtype.__name__),
-            self.km,
-            blocks=blocks,
-        )
-
-    def __elementwise__(self, op_name, other):
-        other = self.check_or_convert_other(other)
-        if self.shape == other.shape and self.block_shape == other.block_shape:
-            return self._fast_element_wise(op_name, other)
-        blocks_op = self.blocks.__getattribute__("__%s__" % op_name)
-        return BlockArray.from_blocks(
-            blocks_op(other.blocks), result_shape=None, km=self.km
-        )
-
-    def __neg__(self):
-        return self.ufunc("negative")
-
-    def __pos__(self):
-        return self
-
-    def __abs__(self):
-        return self.ufunc("abs")
-
-    def __mod__(self, other):
-        return self.__elementwise__("mod", other)
-
-    def __rmod__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__elementwise__("mod", self)
-
-    __imod__ = __mod__
-
-    def __add__(self, other):
-        if isinstance(other, array_sparse.SparseBlockArray):
-            return NotImplemented
-        return self.__elementwise__("add", other)
-
-    def __radd__(self, other):
-        if isinstance(other, array_sparse.SparseBlockArray):
-            return NotImplemented
-        other = self.check_or_convert_other(other)
-        return other.__elementwise__("add", self)
-
-    __iadd__ = __add__
-
-    def __sub__(self, other):
-        if isinstance(other, array_sparse.SparseBlockArray):
-            return NotImplemented
-        return self.__elementwise__("sub", other)
-
-    def __rsub__(self, other):
-        if isinstance(other, array_sparse.SparseBlockArray):
-            return NotImplemented
-        other = self.check_or_convert_other(other)
-        return other.__elementwise__("sub", self)
-
-    __isub__ = __sub__
-
-    def __mul__(self, other):
-        if isinstance(other, array_sparse.SparseBlockArray):
-            return NotImplemented
-        return self.__elementwise__("mul", other)
-
-    def __rmul__(self, other):
-        if isinstance(other, array_sparse.SparseBlockArray):
-            return NotImplemented
-        other = self.check_or_convert_other(other)
-        return other.__elementwise__("mul", self)
-
-    __imul__ = __mul__
-
-    def __truediv__(self, other):
-        if isinstance(other, array_sparse.SparseBlockArray):
-            return NotImplemented
-        return self.__elementwise__("truediv", other)
-
-    def __rtruediv__(self, other):
-        if isinstance(other, array_sparse.SparseBlockArray):
-            return NotImplemented
-        other = self.check_or_convert_other(other)
-        return other / self
-
-    __itruediv__ = __truediv__
-
-    def __floordiv__(self, other):
-        if isinstance(other, array_sparse.SparseBlockArray):
-            return NotImplemented
-        return self.__elementwise__("floor_divide", other)
-
-    def __rfloordiv__(self, other):
-        if isinstance(other, array_sparse.SparseBlockArray):
-            return NotImplemented
-        other = self.check_or_convert_other(other)
-        return other.__elementwise__("floor_divide", self)
-
-    __ifloordiv__ = __floordiv__
-
-    def __pow__(self, other):
-        if isinstance(other, array_sparse.SparseBlockArray):
-            return NotImplemented
-        return self.__elementwise__("pow", other)
-
-    def __rpow__(self, other):
-        if isinstance(other, array_sparse.SparseBlockArray):
-            return NotImplemented
-        other = self.check_or_convert_other(other)
-        return other**self
-
-    __ipow__ = __pow__
-
     #################
     # Inequalities
     #################
 
-    def __inequality__(self, op, other):
+    def __inequality__(self, op_name, other):
         other = self.check_or_convert_other(other)
+        if other is NotImplemented:
+            return NotImplemented
         assert (
             other.shape == () or other.shape == self.shape
         ), "Currently supports comparison with scalars only."
@@ -1037,129 +913,10 @@ def __inequality__(self, op_name, other):
             else:
                 other_block: Block = other.blocks[grid_entry]
             result.blocks[grid_entry] = self.blocks[grid_entry].bop(
-                op, other_block, args={}
+                op_name, other_block, args={}
             )
-
         return result
 
-    def __ge__(self, other):
-        return self.__inequality__("ge", other)
-
-    def __rge__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__inequality__("ge", self)
-
-    def __gt__(self, other):
-        return self.__inequality__("gt", other)
-
-    def __rgt__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__inequality__("gt", self)
-
-    def __le__(self, other):
-        return self.__inequality__("le", other)
-
-    def __rle__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__inequality__("le", self)
-
-    def __lt__(self, other):
-        return self.__inequality__("lt", other)
-
-    def __rlt__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__inequality__("lt", self)
-
-    def __eq__(self, other):
-        return self.__inequality__("eq", other)
-
-    def __req__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__inequality__("eq", self)
-
-    def __ne__(self, other):
-        return self.__inequality__("ne", other)
-
-    def __rne__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__inequality__("ne", self)
-
-    ##################
-    # Boolean
-    ##################
-
-    # TODO (hme): Type check bool ops.
-    def __bool__(self):
-        # pylint: disable=no-member
-        if np.sum(self.shape) == len(self.shape):
-            # If all ones or scalar, then this is defined.
-            return self.get().__bool__()
-        return True
-
-    def __invert__(self):
-        return self.ufunc("invert")
-
-    def __or__(self, other):
-        return self.__elementwise__("bitwise_or", other)
-
-    def __ror__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__elementwise__("bitwise_or", self)
-
-    __ior__ = __or__
-
-    def __and__(self, other):
-        return self.__elementwise__("bitwise_and", other)
-
-    def __rand__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__elementwise__("bitwise_and", self)
-
-    __iand__ = __and__
-
-    def __xor__(self, other):
-        return self.__elementwise__("bitwise_xor", other)
-
-    def __rxor__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__elementwise__("bitwise_xor", self)
-
-    __ixor__ = __xor__
-
-    def __lshift__(self, other):
-        return self.__elementwise__("left_shift", other)
-
-    def __rlshift__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__elementwise__("left_shift", self)
-
-    __ilshift__ = __lshift__
-
-    def __rshift__(self, other):
-        return self.__elementwise__("right_shift", other)
-
-    def __rrshift__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__elementwise__("right_shift", self)
-
-    __irshift__ = __rshift__
-
-    # All operators: https://docs.python.org/3/library/operator.html
-
-    def astype(self, dtype):
-        grid = ArrayGrid(self.shape, self.block_shape, dtype.__name__)
-        result = BlockArray(grid, self.km)
-        for grid_entry in result.grid.get_entry_iterator():
-            result.blocks[grid_entry] = self.blocks[grid_entry].astype(dtype)
-        return result
-
-    def flattened_oids(self):
-        oids = []
-        for grid_entry in self.grid.get_entry_iterator():
-            oid = self.blocks[grid_entry].oid
-            oids.append(oid)
-        return oids
-
 
 class Reshape:
     @staticmethod
@@ -1239,7 +996,7 @@ def _arbitrary_reshape(self, arr: BlockArray, shape, block_shape) -> BlockArray:
         # Generate index mappings per block, and group source indices to minimize
         # RPCs and generation of new objects.
         km = arr.km
-        dst_arr = BlockArray.empty(
+        dst_arr = type(arr).empty(
             shape=shape, block_shape=block_shape, dtype=arr.dtype, km=km
         )
         for dst_grid_entry in dst_arr.grid.get_entry_iterator():
@@ -1268,7 +1025,7 @@ def _arbitrary_reshape(self, arr: BlockArray, shape, block_shape) -> BlockArray:
         return dst_arr
 
     def _block_shape_reshape(self, arr, block_shape):
-        rarr: BlockArray = BlockArray.empty(arr.shape, block_shape, arr.dtype, arr.km)
+        rarr: BlockArray = type(arr).empty(arr.shape, block_shape, arr.dtype, arr.km)
         for grid_entry in rarr.grid.get_entry_iterator():
             grid_entry_slice = rarr.grid.get_slice(grid_entry)
             # TODO (hme): This could be less costly.
diff --git a/nums/core/array/random.py b/nums/core/array/random.py
index 3452860d..5e417270 100644
--- a/nums/core/array/random.py
+++ b/nums/core/array/random.py
@@ -220,23 +220,21 @@ def sparse_randint(
         self,
         low,
         high=None,
-        dtype=int,
+        dtype=None,
         shape=None,
         block_shape=None,
         p=0.01,
-        fill_value=0,
     ):
         if dtype is None:
-            dtype = np.float64
+            dtype = np.int64
         assert isinstance(dtype, type)
         return self._sparse_sample_basic(
             "randint",
+            {"low": low, "high": high, "dtype": dtype},
             shape,
             block_shape,
             dtype,
-            {"low": low, "high": high, "dtype": dtype},
             p,
-            fill_value,
         )
 
     def sparse_uniform(
@@ -247,16 +245,14 @@ def sparse_uniform(
         block_shape=None,
         dtype=None,
         p=0.01,
-        fill_value=0,
     ):
         return self._sparse_sample_basic(
             "uniform",
+            {"low": low, "high": high},
             shape,
             block_shape,
             dtype,
-            {"low": low, "high": high},
             p,
-            fill_value,
         )
 
     def sparse_normal(
@@ -267,27 +263,24 @@ def sparse_normal(
         block_shape=None,
         dtype=None,
         p=0.01,
-        fill_value=0,
     ):
         return self._sparse_sample_basic(
             "normal",
+            {"loc": loc, "scale": scale},
             shape,
             block_shape,
             dtype,
-            {"loc": loc, "scale": scale},
             p,
-            fill_value,
         )
 
     def _sparse_sample_basic(
         self,
         rfunc_name,
+        rfunc_args: Dict,
         shape,
         block_shape,
         dtype,
-        rfunc_args: Dict,
         p,
-        fill_value,
     ) -> SparseBlockArray:
         if shape is None:
             assert block_shape is None
@@ -300,7 +293,7 @@ def _sparse_sample_basic(
         assert isinstance(dtype, type)
         assert "size" not in rfunc_args
         grid: ArrayGrid = ArrayGrid(shape, block_shape, dtype=dtype.__name__)
-        sba: SparseBlockArray = SparseBlockArray(grid, self._km, fill_value)
+        sba: SparseBlockArray = SparseBlockArray(grid, self._km)
         for grid_entry in sba.grid.get_entry_iterator():
             rng_params = list(self._rng.new_block_rng_params())
             this_block_shape = grid.get_block_shape(grid_entry)
@@ -316,8 +309,8 @@ def _sparse_sample_basic(
                 this_block_shape,
                 dtype,
                 p,
-                fill_value,
                 syskwargs=syskwargs,
             )
+            # pylint: disable=protected-access
             block._nnz = sba.km.sparse_nnz(block.oid, syskwargs=syskwargs)
         return sba
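With `fill_value` removed, the samplers always produce zero-filled COO blocks, and `dtype` now defaults per sampler (`np.int64` for `sparse_randint`). A hedged usage sketch; the handle `rs` below is a hypothetical instance of this random-state class, and exact entry points may differ:

```python
# Hypothetical usage; `rs` stands in for an instance of the random-state
# class above. Parameter names match the reworked signatures in this diff.
sba = rs.sparse_randint(
    low=0,
    high=10,
    shape=(1000, 1000),
    block_shape=(100, 100),
    p=0.01,  # expected density of sampled nonzeros per block
)
print(sba.nnz)  # aggregated lazily from per-block nnz results
```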
diff --git a/nums/core/array/sparse.py b/nums/core/array/sparse.py
index fb71b0bd..118edcdc 100644
--- a/nums/core/array/sparse.py
+++ b/nums/core/array/sparse.py
@@ -1,17 +1,20 @@
 from typing import List
+import itertools
+import warnings
+
+import numpy as np
+import sparse
 
 from nums.core.array import utils as array_utils
-from nums.core.array.base import BlockArrayBase, Block
+from nums.core.array.base import BlockBase, Block, BlockArrayBase
 from nums.core.array.blockarray import BlockArray
 from nums.core.kernel.kernel_manager import KernelManager
 from nums.core.grid.grid import ArrayGrid
-import numpy as np
-import itertools
-import sparse
 
 
-# TODO: merge with Block
-# TODO: RHS binary operations
-class SparseBlock(Block):
+# pylint: disable=protected-access, redefined-builtin
+
+
+class SparseBlock(BlockBase):
     def __init__(
         self,
         grid_entry,
@@ -20,23 +23,23 @@ def __init__(
         dtype,
         transposed,
         km: KernelManager,
-        fill_value=0,
         id=None,
         index_dtype=np.int64,
     ):
         super().__init__(grid_entry, grid_shape, shape, dtype, transposed, km, id)
-        self.fill_value = fill_value
         self.index_dtype = index_dtype
         self.oid = None
         self._nnz: object = None
-        self._nbytes: object = (
-            None  # TODO: implement as lazily fetched exact result, same as nnz
-        )
+        self._nbytes: object = None
+
+    @property
+    def is_dense(self):
+        return False
 
     @property
     def nnz(self):
         if self._nnz is None:
-            self._nnz = self._km.sparse_nnz(
+            self._nnz = self.km.sparse_nnz(
                 self.oid,
                 syskwargs={
                     "grid_entry": self.grid_entry,
@@ -44,30 +47,26 @@ def nnz(self):
                 },
             )
             if not array_utils.is_int(self._nnz):
-                self._nnz = self._km.get(self._nnz)
+                self._nnz = self.km.get(self._nnz)
         return self._nnz
 
-    # TODO: implement as lazily fetched exact result
     @property
     def nbytes(self):
-        return self._estimate_nbytes(format="coo")
-
-    # TODO: deprecate
-    def _estimate_nbytes(self, format=None):
-        if format is None:
-            return self.nnz
-        elif format == "coo":
-            return (
-                self.nnz * np.dtype(self.dtype).itemsize
-                + self.nnz * self.ndim * np.dtype(self.index_dtype).itemsize
+        if self._nbytes is None:
+            self._nbytes = self.km.sparse_nbytes(
+                self.oid,
+                syskwargs={
+                    "grid_entry": self.grid_entry,
+                    "grid_shape": self.grid_shape,
+                },
             )
+            if not array_utils.is_int(self._nbytes):
+                self._nbytes = self.km.get(self._nbytes)
+        return self._nbytes
 
     def __repr__(self):
         return f"SparseBlock({self.oid})"
 
-    def size(self):
-        return np.product(self.shape)
-
     def copy(self, shallow=True):
         assert shallow, "Only shallow copies are currently supported."
         block = SparseBlock(
@@ -76,40 +75,24 @@ def copy(self, shallow=True):
             self.shape,
             self.dtype,
             self.transposed,
-            self._km,
+            self.km,
         )
         block.oid = self.oid
         return block
 
-    def transpose(self, defer=False, redistribute=False):
-        grid_entryT = tuple(reversed(self.grid_entry))
-        grid_shapeT = tuple(reversed(self.grid_shape))
-        blockT = SparseBlock(
-            grid_entry=grid_entryT,
-            grid_shape=grid_shapeT,
-            shape=tuple(reversed(self.shape)),
-            dtype=self.dtype,
-            transposed=not self.transposed,
-            km=self._km,
-        )
-        blockT.oid = self.oid
-        if not defer:
-            blockT.transposed = False
-            if redistribute:
-                syskwargs = {"grid_entry": grid_entryT, "grid_shape": grid_shapeT}
-            else:
-                syskwargs = {
-                    "grid_entry": self.grid_entry,
-                    "grid_shape": self.grid_shape,
-                }
-            blockT.oid = self._km.transpose(self.oid, syskwargs=syskwargs)
-        return blockT
-
-    def ufunc(self, op_name, device=None):
-        return self.uop_map(op_name, device=device)
-
-    def uop_map(self, op_name, args=None, kwargs=None, device=None):
-        block = self.copy()
+    def map_uop(self, op_name, args=None, kwargs=None, device=None):
+        densify = array_utils.get_sparse_uop_densify(op_name)
+        if densify:
+            block = Block(
+                self.grid_entry,
+                self.grid_shape,
+                self.shape,
+                self.dtype,
+                self.transposed,
+                self.km,
+            )
+        else:
+            block = self.copy()
         block.dtype = array_utils.get_uop_output_type(op_name, self.dtype)
         args = () if args is None else args
         kwargs = {} if kwargs is None else kwargs
@@ -118,32 +101,48 @@ def uop_map(self, op_name, args=None, kwargs=None, device=None):
         else:
             syskwargs = {"device": device}
             block._device = device
-        block.oid = self._km.sparse_map_uop(
-            op_name, self.oid, args, kwargs, syskwargs=syskwargs
+        block.oid = self.km.sparse_map_uop(
+            op_name, self.oid, args, kwargs, densify, syskwargs=syskwargs
         )
-        block._nnz = self._km.sparse_nnz(block.oid, syskwargs=syskwargs)
-        block.fill_value = np.__getattribute__(op_name)(self.fill_value)
+        if not densify:
+            block._nnz = self.km.sparse_nnz(block.oid, syskwargs=syskwargs)
+            block._nbytes = self.km.sparse_nbytes(block.oid, syskwargs=syskwargs)
         return block
self._km.sparse_nnz(block.oid, syskwargs=syskwargs) - block.fill_value = np.__getattribute__(op_name)(self.fill_value) + block._nnz = self.km.sparse_nnz(block.oid, syskwargs=syskwargs) + block._nbytes = self.km.sparse_nbytes(block.oid, syskwargs=syskwargs) return block - def _block_from_scalar(self, other): + def block_from_scalar(self, other): assert array_utils.is_scalar(other) - block = SparseBlock( - self.grid_entry, - self.grid_shape, - (1,), - self.dtype, - False, - self._km, - fill_value=other, - ) - # TODO: generalize for different kernels - block.oid = self._km.put( - sparse.COO.from_numpy(np.array(other), fill_value=other), - syskwargs={ - "grid_entry": self.grid_entry, - "grid_shape": self.grid_shape, - }, - ) + # Construct sparse block only if value is 0. + if np.isclose(other, 0): + block = SparseBlock( + self.grid_entry, + self.grid_shape, + (1,), + self.dtype, + False, + self.km, + ) + block.oid = self.km.sparse_block_from_scalar( + other, + syskwargs={ + "grid_entry": self.grid_entry, + "grid_shape": self.grid_shape, + }, + ) + else: + block = Block( + self.grid_entry, + self.grid_shape, + (1,), + self.dtype, + False, + self.km, + ) + block.oid = self.km.block_from_scalar( + other, + syskwargs={ + "grid_entry": self.grid_entry, + "grid_shape": self.grid_shape, + }, + ) return block @staticmethod @@ -153,10 +152,7 @@ def init_block(op_name, block1, block2, args, device=None): result_grid_shape, result_shape, dtype, - ) = Block.block_meta(op_name, block1, block2, args) - fill_value = array_utils.get_bop_fill_value( - op_name, block1.fill_value, block2.fill_value - ) + ) = BlockBase.block_meta(op_name, block1, block2, args) # TODO: what happens when different index_dtype? block = SparseBlock( grid_entry=result_grid_entry, @@ -164,22 +160,30 @@ def init_block(op_name, block1, block2, args, device=None): shape=result_shape, dtype=dtype, transposed=False, - fill_value=fill_value, - km=block1._km, + km=block1.km, ) block._device = device return block - def bop(self, op_name, other, args: dict, device=None): - if not isinstance(other, Block): - other = self._block_from_scalar(other) - densify = array_utils.get_sparse_bop_return_type( - op_name, self.fill_value, other.fill_value - ) + def _check_bop_implemented(self, other): + if isinstance(other, BlockBase) or array_utils.is_scalar(other): + return True + return False + + @staticmethod + def binary_op(op_name, a: BlockBase, b: BlockBase, args: dict, device=None): + if isinstance(a, SparseBlock) and array_utils.is_scalar(b): + b = a.block_from_scalar(b) + elif isinstance(b, SparseBlock) and array_utils.is_scalar(a): + a = b.block_from_scalar(a) + if not isinstance(a, BlockBase) or not isinstance(b, BlockBase): + raise NotImplementedError() + + densify = array_utils.get_sparse_bop_densify(op_name, a.is_dense, b.is_dense) if densify: - block = Block.init_block(op_name, self, other, args, device) + block = Block.init_block(op_name, a, b, args, device) else: - block = SparseBlock.init_block(op_name, self, other, args, device) + block = SparseBlock.init_block(op_name, a, b, args, device) if device is None: syskwargs = { "grid_entry": block.grid_entry, @@ -187,46 +191,40 @@ def bop(self, op_name, other, args: dict, device=None): } else: syskwargs = {"device": device} - block.oid = self._km.sparse_bop( + block.oid = a.km.sparse_bop( op_name, - self.oid, - other.oid, - self.transposed, - other.transposed, + a.oid, + b.oid, + a.transposed, + b.transposed, axes=args.get("axes"), densify=densify, syskwargs=syskwargs, ) if not densify: - 
block._nnz = self._km.sparse_nnz(block.oid, syskwargs=syskwargs) + block._nnz = a.km.sparse_nnz(block.oid, syskwargs=syskwargs) + block._nbytes = a.km.sparse_nbytes(block.oid, syskwargs=syskwargs) return block - # TODO: densify when fill_value != 0 + def bop(self, op_name, other, args: dict, device=None): + return self.binary_op(op_name, self, other, args, device) + def tensordot(self, other, axes): - assert self.fill_value == 0 - if not other.is_dense: - assert other.fill_value == 0 - return self.bop("tensordot", other, args={"axes": axes}) + return self.binary_op("tensordot", self, other, args={"axes": axes}) -# TODO: merge with BlockArray, have decorator for dense-only methods? -class SparseBlockArray(BlockArray): +class SparseBlockArray(BlockArrayBase): def __init__( self, grid: ArrayGrid, km: KernelManager, - fill_value=0, blocks: np.ndarray = None, ): - self.grid = grid - self.km = km - self.shape = self.grid.shape - self.block_shape = self.grid.block_shape - self.grid_shape = self.grid.grid_shape - self.size = np.product(self.shape) - self.ndim = len(self.shape) - self.dtype = self.grid.dtype - self.blocks = blocks + if blocks is not None: + assert ( + blocks.dtype == SparseBlock + ), "SparseBlockArray must be initialized with SparseBlocks" + super().__init__(grid, km, blocks) if self.blocks is None: self.blocks = np.empty(shape=self.grid_shape, dtype=SparseBlock) for grid_entry in self.grid.get_entry_iterator(): @@ -237,19 +235,17 @@ def __init__( self.dtype, False, self.km, - fill_value, ) - self.fill_value = fill_value self._nnz = -1 self._nbytes = -1 @property - def nnz(self): - return self._get_nnz() + def is_dense(self): + return False @property - def nbytes(self): - return self._get_nbytes() + def nnz(self): + return self._get_nnz() def _get_nnz(self): if self._nnz == -1: @@ -258,6 +254,10 @@ def _get_nnz(self): self._nnz += self.blocks[grid_entry].nnz return self._nnz + @property + def nbytes(self): + return self._get_nbytes() + def _get_nbytes(self): if self._nbytes == -1: self._nbytes = 0 @@ -265,11 +265,32 @@ def _get_nbytes(self): self._nbytes += self.blocks[grid_entry].nbytes return self._nbytes + def __repr__(self): + return f"SparseBlockArray({self.blocks})" + + @classmethod + def empty(cls, shape, block_shape, dtype, km: KernelManager): + return SparseBlockArray.create("empty", shape, block_shape, dtype, km) + + @classmethod + def create(cls, create_op_name, shape, block_shape, dtype, km: KernelManager): + grid = ArrayGrid(shape=shape, block_shape=block_shape, dtype=dtype.__name__) + grid_meta = grid.to_meta() + arr = SparseBlockArray(grid, km) + for grid_entry in grid.get_entry_iterator(): + arr.blocks[grid_entry].oid = km.new_sparse_block( + create_op_name, + grid_entry, + grid_meta, + syskwargs={"grid_entry": grid_entry, "grid_shape": grid.grid_shape}, + ) + return arr + @classmethod - def from_np(cls, arr, block_shape, copy, km, fill_value=0): + def from_np(cls, arr, block_shape, copy, km): dtype_str = str(arr.dtype) grid = ArrayGrid(arr.shape, block_shape, dtype_str) - rarr = SparseBlockArray(grid, km, fill_value) + rarr = SparseBlockArray(grid, km) grid_entry_iterator = grid.get_entry_iterator() for grid_entry in grid_entry_iterator: grid_slice = grid.get_slice(grid_entry) @@ -277,7 +298,7 @@ def from_np(cls, arr, block_shape, copy, km, fill_value=0): if copy: block = np.copy(block) # TODO: generalize for different kernels - block = sparse.COO.from_numpy(block, fill_value) + block = sparse.COO.from_numpy(block, fill_value=0) rarr.blocks[grid_entry].oid = 
km.put( block, syskwargs={"grid_entry": grid_entry, "grid_shape": grid.grid_shape}, @@ -286,10 +307,10 @@ def from_np(cls, arr, block_shape, copy, km, fill_value=0): return rarr @classmethod - def from_sparse(cls, arr, block_shape, copy, km, fill_value=0): + def from_sparse(cls, arr, block_shape, copy, km): dtype_str = str(arr.dtype) grid = ArrayGrid(arr.shape, block_shape, dtype_str) - rarr = SparseBlockArray(grid, km, fill_value) + rarr = SparseBlockArray(grid, km) grid_entry_iterator = grid.get_entry_iterator() for grid_entry in grid_entry_iterator: grid_slice = grid.get_slice(grid_entry) @@ -306,14 +327,16 @@ def from_sparse(cls, arr, block_shape, copy, km, fill_value=0): @classmethod def from_scalar(cls, val, km): - if not array_utils.is_scalar(val): - raise ValueError("%s is not a scalar." % val) - return SparseBlockArray.from_np( - np.array(val), (), copy=False, km=km, fill_value=val - ) + # Only create SparseBlockArray with 0s. Other scalars should use dense BlockArray. + if not np.isclose(val, 0): + warnings.warn( + "%s cannot fill SparseBlockArray. Converting to BlockArray." % val + ) + return BlockArray.from_np(np.array(val), (), copy=False, km=km) + return SparseBlockArray.from_np(np.array(val), (), copy=False, km=km) @classmethod - def from_blocks(cls, arr: np.ndarray, result_shape, km, fill_value): + def from_blocks(cls, arr: np.ndarray, result_shape, km): sample_idx = tuple(0 for _ in arr.shape) if isinstance(arr, SparseBlock): sample_block = arr @@ -328,7 +351,7 @@ def from_blocks(cls, arr: np.ndarray, result_shape, km, fill_value): shape=result_shape, block_shape=result_block_shape, dtype=result_dtype_str ) assert arr.shape == result_grid.grid_shape - result = SparseBlockArray(result_grid, km, fill_value) + result = SparseBlockArray(result_grid, km) for grid_entry in result_grid.get_entry_iterator(): if isinstance(arr, SparseBlock): block: SparseBlock = arr @@ -338,12 +361,12 @@ def from_blocks(cls, arr: np.ndarray, result_shape, km, fill_value): return result @classmethod - def from_ba(cls, ba: BlockArrayBase, fill_value=0): + def from_ba(cls, ba: BlockArrayBase): assert ( ba.shape != () ), "from_ba does not support scalar BlockArray. Use from_scalar." 
grid = ArrayGrid(ba.shape, ba.block_shape, ba.dtype.__name__) - sba = SparseBlockArray(grid, ba.km, fill_value) + sba = SparseBlockArray(grid, ba.km) for grid_entry in grid.get_entry_iterator(): block: Block = ba.blocks[grid_entry] sblock: SparseBlock = sba.blocks[grid_entry] @@ -353,11 +376,10 @@ def from_ba(cls, ba: BlockArrayBase, fill_value=0): } sblock.oid = sba.km.dense_to_sparse( block.oid, - fill_value, syskwargs=syskwargs, ) sblock._nnz = sba.km.sparse_nnz(sblock.oid, syskwargs=syskwargs) - sba.fill_value = fill_value + sblock._nbytes = sba.km.sparse_nbytes(sblock.oid, syskwargs=syskwargs) return sba def to_ba(self): @@ -378,11 +400,18 @@ def todense(self) -> BlockArray: def copy(self): grid_copy = self.grid.from_meta(self.grid.to_meta()) - rarr_copy = SparseBlockArray(grid_copy, self.km, self.fill_value) + rarr_copy = SparseBlockArray(grid_copy, self.km) for grid_entry in grid_copy.get_entry_iterator(): rarr_copy.blocks[grid_entry] = self.blocks[grid_entry].copy() return rarr_copy + def astype(self, dtype): + grid = ArrayGrid(self.shape, self.block_shape, dtype.__name__) + result = BlockArray(grid, self.km) + for grid_entry in result.grid.get_entry_iterator(): + result.blocks[grid_entry] = self.blocks[grid_entry].astype(dtype) + return result + @staticmethod def to_block_array(obj, km: KernelManager, block_shape=None): if isinstance(obj, (BlockArray, SparseBlockArray)): @@ -397,176 +426,270 @@ def to_block_array(obj, km: KernelManager, block_shape=None): raise Exception("Unsupported type %s" % type(obj)) if block_shape is None: block_shape = km.get_block_shape(np_array.shape, np_array.dtype) + # Assume object is dense. return BlockArray.from_np(np_array, block_shape, False, km) - def transpose(self, defer=False, redistribute=False): - if defer and redistribute: - warnings.warn("defer is True, redistribute=True will be ignored.") - metaT = self.grid.to_meta() - metaT["shape"] = tuple(reversed(metaT["shape"])) - metaT["block_shape"] = tuple(reversed(metaT["block_shape"])) - gridT = ArrayGrid.from_meta(metaT) - rarrT = SparseBlockArray(gridT, self.km, self.fill_value) - rarrT.blocks = np.copy(self.blocks.T) - for grid_entry in rarrT.grid.get_entry_iterator(): - rarrT.blocks[grid_entry] = rarrT.blocks[grid_entry].transpose( - defer, redistribute - ) - return rarrT - def check_or_convert_other(self, other, compute_block_shape=False): block_shape = None if compute_block_shape else self.block_shape return SparseBlockArray.to_block_array(other, self.km, block_shape=block_shape) + def _check_bop_implemented(self, other): + if isinstance( + other, (BlockArrayBase, np.ndarray, list) + ) or array_utils.is_scalar(other): + return True + return False + def ufunc(self, op_name): - result = self.copy() + densify = array_utils.get_sparse_uop_densify(op_name) + if densify: + result = BlockArray(self.grid, self.km) + else: + result = self.copy() for grid_entry in self.grid.get_entry_iterator(): result.blocks[grid_entry] = self.blocks[grid_entry].ufunc(op_name) - func = np.__getattribute__(op_name) - result.fill_value = func(self.fill_value) result._nnz = -1 + result._nbytes = -1 return result - def sdtp(self, *block_arrays: List[BlockArray]): - """ - Perform a sampled tensor product among an arbitrary number of block arrays. - """ - assert np.allclose(self.fill_value, 0) - for i, ba in enumerate(block_arrays): - assert len(ba.shape) == 1 - assert ba.shape[0] == self.shape[i] - assert ba.block_shape[0] == self.block_shape[i] - # Sparsity of result is same as self. 
- result: SparseBlockArray = SparseBlockArray(self.grid, self.km, self.fill_value) + def reduce_axis(self, op_name, axis, keepdims=False): + if not (axis is None or isinstance(axis, (int, np.int32, np.int64))): + raise NotImplementedError("Only integer axis is currently supported.") + if 0 in self.shape: + return SparseBlockArray.create("zeros", (), (), float, self.km) + block_reduced_oids = np.empty_like(self.blocks, dtype=tuple) for grid_entry in self.grid.get_entry_iterator(): - dense_oids = [ - ba.blocks[grid_entry[i]].oid for i, ba in enumerate(block_arrays) - ] - result.blocks[grid_entry].oid = self.km.sdtp( - self.blocks[grid_entry].oid, - *dense_oids, - syskwargs={"grid_entry": grid_entry, "grid_shape": result.grid_shape}, + block = self.blocks[grid_entry] + block_oid = self.km.sparse_reduce_axis( + op_name=op_name, + arr=block.oid, + axis=axis, + keepdims=keepdims, + transposed=block.transposed, + syskwargs={ + "grid_entry": block.grid_entry, + "grid_shape": block.grid_shape, + }, ) - return result - - def sdtd(self, x: BlockArray, y: BlockArray, axes: int): - """ - Perform a sampled dense-dense tensor dot between two tensors x and y. - In addition to several compatibility-related constraints to ensure - the operation is actually valid, we also require that the fill value of - the sparse array is 0, and that the axes over which the tensor dot - is being performed are not partitioned. - The last constraint can be eliminated if we add a sampled element-wise kernel. - """ - assert np.allclose(self.fill_value, 0) - - if array_utils.np_tensordot_param_test(x.shape, x.ndim, y.shape, y.ndim, axes): - raise ValueError("shape-mismatch for sum") - - x_axes = x.grid.grid_shape[:-axes] - x_sum_axes = x.grid.grid_shape[-axes:] - y_axes = y.grid.grid_shape[axes:] - y_sum_axes = y.grid.grid_shape[:axes] - assert x_sum_axes == y_sum_axes - result_shape = tuple(x.shape[:-axes] + y.shape[axes:]) - result_block_shape = tuple(x.block_shape[:-axes] + y.block_shape[axes:]) + block_reduced_oids[grid_entry] = ( + block_oid, + block.grid_entry, + block.grid_shape, + False, + ) + result_shape = [] + result_block_shape = [] + for curr_axis in range(len(self.shape)): + axis_size, axis_block_size = ( + self.shape[curr_axis], + self.block_shape[curr_axis], + ) + if curr_axis == axis or axis is None: + if keepdims: + axis_size, axis_block_size = 1, 1 + else: + continue + result_shape.append(axis_size) + result_block_shape.append(axis_block_size) + result_shape = tuple(result_shape) + result_block_shape = tuple(result_block_shape) + result_dtype = array_utils.get_reduce_output_type(op_name, self.dtype) result_grid = ArrayGrid( shape=result_shape, block_shape=result_block_shape, - dtype=array_utils.get_bop_output_type( - "tensordot", x.dtype, y.dtype - ).__name__, + dtype=result_dtype.__name__, ) - # Ensure dims over which we sum are not partitioned. - # This is currently required for scalable sampled dense. 
-            assert np.sum(x_sum_axes) == np.sum(y_sum_axes) == axes
+        result = BlockArray(result_grid, self.km)
-        assert result_grid.grid_shape == tuple(x_axes + y_axes)
-        assert result_grid.grid_shape == self.grid_shape
-        assert result_grid.block_shape == self.block_shape
-        result: SparseBlockArray = SparseBlockArray(self.grid, self.km, self.fill_value)
-        this_dims = list(itertools.product(*map(range, x_axes)))
-        other_dims = list(itertools.product(*map(range, y_axes)))
-        sum_dims = tuple([0] * axes)
-        for i in this_dims:
-            for j in other_dims:
-                grid_entry = tuple(i + j)
-                x_block: Block = x.blocks[tuple(i + sum_dims)]
-                y_block: Block = y.blocks[tuple(sum_dims + j)]
-                result.blocks[grid_entry].oid = self.km.sdtd(
-                    self.blocks[grid_entry].oid,
-                    x_block.oid,
-                    y_block.oid,
-                    axes=axes,
+        if axis is None:
+            if result.shape == ():
+                result_block: Block = result.blocks[()]
+            else:
+                result_block: Block = result.blocks[:].item()
+            result_block.oid = self.tree_reduce(
+                op_name,
+                block_reduced_oids.flatten().tolist(),
+                result_block.grid_entry,
+                result_block.grid_shape,
+                False,
+            )
+        else:
+            for result_grid_entry in result_grid.get_entry_iterator():
+                block_reduced_oids_axis = []
+                for sum_dim in range(self.grid.grid_shape[axis]):
+                    grid_entry = list(result_grid_entry)
+                    if keepdims:
+                        grid_entry[axis] = sum_dim
+                    else:
+                        grid_entry = grid_entry[:axis] + [sum_dim] + grid_entry[axis:]
+                    grid_entry = tuple(grid_entry)
+                    block_reduced_oids_axis.append(block_reduced_oids[grid_entry])
+                result_block: Block = result.blocks[result_grid_entry]
+                result_block.oid = self.tree_reduce(
+                    op_name,
+                    block_reduced_oids_axis,
+                    result_block.grid_entry,
+                    result_block.grid_shape,
+                    False,
+                )
+        return result
+
+    # pylint: disable=arguments-differ
+    def tree_reduce(
+        self,
+        op_name,
+        blocks_or_oids,
+        result_grid_entry,
+        result_grid_shape,
+        densify,
+        *args,
+    ):
+        """
+        Basic tree reduce implementation.
+        Schedules op on same node as left operand.
+        :param op_name: The reduction op.
+        :param blocks_or_oids: A list of type Block or a list of tuples.
+        Tuples must be of the form
+        (oid, grid_entry, grid_shape, transposed)
+        :param result_grid_entry: The grid entry of the result block. This will be used
+        to compute the final reduction step.
+        :param result_grid_shape: The grid shape of the result block. This will be used
+        to compute the final reduction step.
+        :param densify: Whether to reduce with the dense kernel (bop_reduce)
+        or the sparse kernel (sparse_bop_reduce).
+        :return: The oid of the result.
+ """ + oid_list = blocks_or_oids + if isinstance(blocks_or_oids[0], Block): + oid_list = [ + (b.oid, b.grid_entry, b.grid_shape, b.transposed) + for b in blocks_or_oids + ] + if len(oid_list) == 1: + return oid_list[0][0] + q = oid_list + if densify: + while len(q) > 1: + a_oid, a_ge, a_gs, a_T = q.pop(0) + b_oid, _, _, b_T = q.pop(0) + ge, gs = ( + (result_grid_entry, result_grid_shape) + if len(q) == 0 + else (a_ge, a_gs) + ) + c_oid = self.km.bop_reduce( + op_name, + a_oid, + b_oid, + a_T, + b_T, syskwargs={ - "grid_entry": grid_entry, - "grid_shape": result.grid_shape, + "grid_entry": ge, + "grid_shape": gs, }, ) - return result + q.append((c_oid, ge, gs, False)) + else: + while len(q) > 1: + a_oid, a_ge, a_gs, a_T = q.pop(0) + b_oid, _, _, b_T = q.pop(0) + ge, gs = ( + (result_grid_entry, result_grid_shape) + if len(q) == 0 + else (a_ge, a_gs) + ) + c_oid = self.km.sparse_bop_reduce( + op_name, + a_oid, + b_oid, + a_T, + b_T, + syskwargs={ + "grid_entry": ge, + "grid_shape": gs, + }, + ) + q.append((c_oid, ge, gs, False)) + r_oid, r_ge, r_gs, _ = q.pop(0) + assert r_ge == result_grid_entry + assert r_gs == result_grid_shape + return r_oid + + ################# + # Arithmetic + ################# - def _fast_elementwise(self, op_name, other, densify): - dtype = array_utils.get_bop_output_type(op_name, self.dtype, other.dtype) + @staticmethod + def elementwise(op_name, a, b): + if isinstance(a, SparseBlockArray): + b = a.check_or_convert_other(b) + elif isinstance(b, SparseBlockArray): + a = b.check_or_convert_other(a) + else: + raise NotImplementedError() + + densify = array_utils.get_sparse_bop_densify(op_name, a.is_dense, b.is_dense) + if a.shape == b.shape and a.block_shape == b.block_shape: + return SparseBlockArray._fast_elementwise(op_name, a, b, densify) + else: + blocks_op = a.blocks.__getattribute__(f"__{op_name}__") + if densify: + result = BlockArray.from_blocks( + blocks_op(b.blocks), + result_shape=None, + km=a.km, + ) + else: + result = SparseBlockArray.from_blocks( + blocks_op(b.blocks), + result_shape=None, + km=a.km, + ) + return result + + @staticmethod + def _fast_elementwise(op_name, a, b, densify): + # a, b have the same grid_shape and block_shape + dtype = array_utils.get_bop_output_type(op_name, a.dtype, b.dtype) if densify: - blocks = np.empty(shape=self.grid_shape, dtype=Block) + block_type = Block else: - blocks = np.empty(shape=self.grid_shape, dtype=SparseBlock) - for grid_entry in self.grid.get_entry_iterator(): - self_block: SparseBlock = self.blocks[grid_entry] - other_block: Block = other.blocks[grid_entry] - blocks[grid_entry] = block = self_block.bop(op_name, other_block, args={}) - block.oid = self.km.sparse_bop( + block_type = SparseBlock + blocks = np.empty(shape=a.grid_shape, dtype=block_type) + for grid_entry in a.grid.get_entry_iterator(): + a_block: BlockBase = a.blocks[grid_entry] + b_block: BlockBase = b.blocks[grid_entry] + blocks[grid_entry] = block_type( + grid_entry, + a_block.grid_shape, + a_block.shape, + dtype, + transposed=False, + km=a.km, + ) + blocks[grid_entry].oid = a.km.sparse_bop( op_name, - self_block.oid, - other_block.oid, - self_block.transposed, - other_block.transposed, + a_block.oid, + b_block.oid, + a_block.transposed, + b_block.transposed, axes={}, densify=densify, syskwargs={ "grid_entry": grid_entry, - "grid_shape": self.grid.grid_shape, + "grid_shape": a.grid.grid_shape, }, ) - grid = ArrayGrid(self.shape, self.block_shape, dtype.__name__) + grid = ArrayGrid(a.shape, a.block_shape, dtype.__name__) if densify: - 
return BlockArray(grid, self.km, blocks=blocks) + return BlockArray(grid, a.km, blocks=blocks) else: - fill_value = array_utils.get_bop_fill_value( - op_name, self.fill_value, other.fill_value - ) - result = SparseBlockArray(grid, self.km, fill_value, blocks=blocks) - return result + return SparseBlockArray(grid, a.km, blocks=blocks) - def __elementwise__(self, op_name, other): - other = self.check_or_convert_other(other) - densify = array_utils.get_sparse_bop_return_type( - op_name, - self.fill_value, - other.fill_value, - ) - if self.shape == other.shape and self.block_shape == other.block_shape: - return self._fast_elementwise(op_name, other, densify) - blocks_op = self.blocks.__getattribute__("__%s__" % op_name) - if densify: - result = BlockArray.from_blocks( - blocks_op(other.blocks), - result_shape=None, - km=self.km, - ) - else: - fill_value = array_utils.get_bop_fill_value( - op_name, self.fill_value, other.fill_value - ) - result = SparseBlockArray.from_blocks( - blocks_op(other.blocks), - result_shape=None, - fill_value=fill_value, - km=self.km, - ) - return result + ################# + # Linear Algebra + ################# - def tensordot(self, other, axes=2): + @staticmethod + def tensordot(a, b, axes=2): if isinstance(axes, int): pass elif array_utils.is_array_like(axes): @@ -574,73 +697,65 @@ def tensordot(self, other, axes=2): else: raise TypeError(f"Unexpected axes type '{type(axes).__name__}'") - other = self.check_or_convert_other(other, compute_block_shape=True) + if isinstance(a, SparseBlockArray): + b = a.check_or_convert_other(b, compute_block_shape=True) + elif isinstance(b, SparseBlockArray): + a = b.check_or_convert_other(a, compute_block_shape=True) + else: + raise NotImplementedError() - if array_utils.np_tensordot_param_test( - self.shape, self.ndim, other.shape, other.ndim, axes - ): + if array_utils.np_tensordot_param_test(a.shape, a.ndim, b.shape, b.ndim, axes): raise ValueError("shape-mismatch for sum") - # Pydata/Sparse only works with fill_value == 0 - # TODO: densify when fill_value != 0 - assert self.fill_value == 0 - if isinstance(other, SparseBlockArray): - assert other.fill_value == 0, "Sparse-dense tensordot may not be tractable." 
- densify = array_utils.get_sparse_bop_return_type( - "tensordot", - self.fill_value, - other.fill_value, + densify = array_utils.get_sparse_bop_densify( + "tensordot", a.is_dense, b.is_dense ) if axes > 0: - this_axes = self.grid.grid_shape[:-axes] - this_sum_axes = self.grid.grid_shape[-axes:] - other_axes = other.grid.grid_shape[axes:] - other_sum_axes = other.grid.grid_shape[:axes] - assert this_sum_axes == other_sum_axes - result_shape = tuple(self.shape[:-axes] + other.shape[axes:]) - result_block_shape = tuple( - self.block_shape[:-axes] + other.block_shape[axes:] - ) + a_axes = a.grid.grid_shape[:-axes] + a_sum_axes = a.grid.grid_shape[-axes:] + b_axes = b.grid.grid_shape[axes:] + b_sum_axes = b.grid.grid_shape[:axes] + assert a_sum_axes == b_sum_axes + result_shape = tuple(a.shape[:-axes] + b.shape[axes:]) + result_block_shape = tuple(a.block_shape[:-axes] + b.block_shape[axes:]) else: - this_axes = self.grid.grid_shape - other_axes = other.grid.grid_shape - this_sum_axes = () - result_shape = tuple(self.shape + other.shape) - result_block_shape = tuple(self.block_shape + other.block_shape) + a_axes = a.grid.grid_shape + b_axes = b.grid.grid_shape + a_sum_axes = () + result_shape = tuple(a.shape + b.shape) + result_block_shape = tuple(a.block_shape + b.block_shape) result_grid = ArrayGrid( shape=result_shape, block_shape=result_block_shape, dtype=array_utils.get_bop_output_type( - "tensordot", self.dtype, other.dtype + "tensordot", a.dtype, b.dtype ).__name__, ) - assert result_grid.grid_shape == tuple(this_axes + other_axes) + assert result_grid.grid_shape == tuple(a_axes + b_axes) if densify: - result = BlockArray(result_grid, self.km) + result = BlockArray(result_grid, a.km) else: - result = SparseBlockArray(result_grid, self.km, self.fill_value) - this_dims = list(itertools.product(*map(range, this_axes))) - other_dims = list(itertools.product(*map(range, other_axes))) - sum_dims = list(itertools.product(*map(range, this_sum_axes))) - for i in this_dims: - for j in other_dims: + result = SparseBlockArray(result_grid, a.km) + a_dims = list(itertools.product(*map(range, a_axes))) + b_dims = list(itertools.product(*map(range, b_axes))) + sum_dims = list(itertools.product(*map(range, a_sum_axes))) + for i in a_dims: + for j in b_dims: grid_entry = tuple(i + j) result_block: Block = result.blocks[grid_entry] sum_oids = [] for k in sum_dims: - self_block: Block = self.blocks[tuple(i + k)] - other_block: Block = other.blocks[tuple(k + j)] - dot_grid_args = self._compute_tensordot_syskwargs( - self_block, other_block - ) - dotted_oid = self.km.sparse_bop( + a_block: Block = a.blocks[tuple(i + k)] + b_block: Block = b.blocks[tuple(k + j)] + dot_grid_args = a._compute_tensordot_syskwargs(a_block, b_block) + dotted_oid = a.km.sparse_bop( "tensordot", - self_block.oid, - other_block.oid, - self_block.transposed, - other_block.transposed, + a_block.oid, + b_block.oid, + a_block.transposed, + b_block.transposed, axes=axes, densify=densify, syskwargs={ @@ -651,81 +766,117 @@ def tensordot(self, other, axes=2): sum_oids.append( (dotted_oid, dot_grid_args[0], dot_grid_args[1], False) ) - result_block.oid = self.tree_reduce( - "sum", sum_oids, result_block.grid_entry, result_block.grid_shape + result_block.oid = a.tree_reduce( + "sum", + sum_oids, + result_block.grid_entry, + result_block.grid_shape, + densify, ) if not densify: - result_block._nnz = self.km.sparse_nnz( - result_block.oid, - syskwargs={ - "grid_entry": result_block.grid_entry, - "grid_shape": result_block.grid_shape, - }, + 
syskwargs = { + "grid_entry": result_block.grid_entry, + "grid_shape": result_block.grid_shape, + } + result_block._nnz = a.km.sparse_nnz( + result_block.oid, syskwargs=syskwargs + ) + result_block._nbytes = a.km.sparse_nbytes( + result_block.oid, syskwargs=syskwargs ) return result - def __add__(self, other): - return self.__elementwise__("add", other) - - def __radd__(self, other): - return self.__elementwise__("add", other) - - __iadd__ = __add__ - - def __sub__(self, other): - return self.__elementwise__("sub", other) - - def __rsub__(self, other): - # FIXME: not commutative - return self.__elementwise__("sub", other) - - __isub__ = __sub__ - - def __mul__(self, other): - return self.__elementwise__("mul", other) - - def __rmul__(self, other): - return self.__elementwise__("mul", other) - - __imul__ = __mul__ - - def __truediv__(self, other): - return self.__elementwise__("truediv", other) - - def __rtruediv__(self, other): - return self.__elementwise__("truediv", other) - - __itruediv__ = __truediv__ - - def __floordiv__(self, other): - return self.__elementwise__("floordiv", other) - - def __rfloordiv__(self, other): - return self.__elementwise__("floordiv", other) + def sdtp(self, *block_arrays: List[BlockArray]): + """ + Perform a sampled tensor product among an arbitrary number of block arrays. + """ + for i, ba in enumerate(block_arrays): + assert len(ba.shape) == 1 + assert ba.shape[0] == self.shape[i] + assert ba.block_shape[0] == self.block_shape[i] + # Sparsity of result is same as self. + result: SparseBlockArray = SparseBlockArray(self.grid, self.km) + for grid_entry in self.grid.get_entry_iterator(): + dense_oids = [ + ba.blocks[grid_entry[i]].oid for i, ba in enumerate(block_arrays) + ] + result.blocks[grid_entry].oid = self.km.sdtp( + self.blocks[grid_entry].oid, + *dense_oids, + syskwargs={"grid_entry": grid_entry, "grid_shape": result.grid_shape}, + ) + return result - __ifloordiv__ = __floordiv__ + def sdtd(self, x: BlockArray, y: BlockArray, axes: int): + """ + Perform a sampled dense-dense tensor dot between two tensors x and y. + In addition to several compatibility-related constraints to ensure + the operation is actually valid, we also require that the fill value of + the sparse array is 0, and that the axes over which the tensor dot + is being performed are not partitioned. + The last constraint can be eliminated if we add a sampled element-wise kernel. + """ + if array_utils.np_tensordot_param_test(x.shape, x.ndim, y.shape, y.ndim, axes): + raise ValueError("shape-mismatch for sum") - def __pow__(self, other): - return self.__elementwise__("pow", other) + x_axes = x.grid.grid_shape[:-axes] + x_sum_axes = x.grid.grid_shape[-axes:] + y_axes = y.grid.grid_shape[axes:] + y_sum_axes = y.grid.grid_shape[:axes] + assert x_sum_axes == y_sum_axes + result_shape = tuple(x.shape[:-axes] + y.shape[axes:]) + result_block_shape = tuple(x.block_shape[:-axes] + y.block_shape[axes:]) + result_grid = ArrayGrid( + shape=result_shape, + block_shape=result_block_shape, + dtype=array_utils.get_bop_output_type( + "tensordot", x.dtype, y.dtype + ).__name__, + ) + # Ensure dims over which we sum are not partitioned. + # This is currently required for scalable sampled dense. 
+        assert np.sum(x_sum_axes) == np.sum(y_sum_axes) == axes
-    def __rpow__(self, other):
-        return self.__elementwise__("pow", other)
+        assert result_grid.grid_shape == tuple(x_axes + y_axes)
+        assert result_grid.grid_shape == self.grid_shape
+        assert result_grid.block_shape == self.block_shape
+        result: SparseBlockArray = SparseBlockArray(self.grid, self.km)
+        x_dims = list(itertools.product(*map(range, x_axes)))
+        y_dims = list(itertools.product(*map(range, y_axes)))
+        sum_dims = tuple([0] * axes)
+        for i in x_dims:
+            for j in y_dims:
+                grid_entry = tuple(i + j)
+                x_block: Block = x.blocks[tuple(i + sum_dims)]
+                y_block: Block = y.blocks[tuple(sum_dims + j)]
+                result.blocks[grid_entry].oid = self.km.sdtd(
+                    self.blocks[grid_entry].oid,
+                    x_block.oid,
+                    y_block.oid,
+                    axes=axes,
+                    syskwargs={
+                        "grid_entry": grid_entry,
+                        "grid_shape": result.grid_shape,
+                    },
+                )
+        return result

-    __ipow__ = __pow__
+    #################
+    # Inequalities
+    #################

     def __inequality__(self, op_name, other):
         other = self.check_or_convert_other(other)
-        assert other.shape == (), "Currently supports comparison with scalars only."
+        assert (
+            other.shape == () or other.shape == self.shape
+        ), "Only scalar or same-shape operands are currently supported."
         shape = array_utils.broadcast(self.shape, other.shape).shape
         block_shape = array_utils.broadcast_block_shape(
             self.shape, other.shape, self.block_shape
         )
         dtype = bool.__name__
         grid = ArrayGrid(shape, block_shape, dtype)
-        fill_value = array_utils.get_bop_fill_value(
-            op_name, self.fill_value, other.fill_value
-        )
-        result = SparseBlockArray(grid, self.km, fill_value)
+        result = SparseBlockArray(grid, self.km)
         for grid_entry in result.grid.get_entry_iterator():
-            other_block: Block = other.blocks.item()
+            # A scalar operand has a single block; a same-shape operand is
+            # indexed block by block.
+            if other.shape == ():
+                other_block: Block = other.blocks.item()
+            else:
+                other_block: Block = other.blocks[grid_entry]
             result.blocks[grid_entry] = self.blocks[grid_entry].bop(
diff --git a/nums/core/array/utils.py b/nums/core/array/utils.py
index 8cefd766..356b78f7 100644
--- a/nums/core/array/utils.py
+++ b/nums/core/array/utils.py
@@ -18,12 +18,9 @@
 
 import numpy as np
 import scipy.special
-import sparse
 
 from nums.core.settings import np_ufunc_map
 from nums.core.array.errors import AxisError
-from nums.core.array.base import Block
-from nums.core.array.sparse import SparseBlock
 
 # pylint: disable = no-member, trailing-whitespace
 
@@ -455,62 +452,25 @@ def normalize_axis_index(axis, ndim):
     return axis % ndim
 
 
-# def get_sparse_bop_return_type(op_name, a: Block, b: Block):
-#     def sample_array(block):
-#         s = np.eye(2)
-#         if isinstance(block, SparseBlock):
-#             return sparse.COO.from_numpy(s, fill_value=block.fill_value)
-#         return s
-
-#     sa = sample_array(a)
-#     sb = sample_array(b)
-#     if op_name == "tensordot":
-#         result = sparse.tensordot(sa, sb)
-#     else:
-#         op_name = np_ufunc_map.get(op_name, op_name)
-#         try:
-#             ufunc = np.__getattribute__(op_name)
-#         except Exception as _:
-#             ufunc = scipy.special.__getattribute__(op_name)
-#         result = sparse.elemwise(ufunc, sa, sb)
-#     if isinstance(result, sparse.SparseArray):
-#         return False
-#     return True
-
-
-def get_sparse_bop_return_type(op_name, a_fv, b_fv):
-    def sample_array(fv):
-        s = np.eye(2)
-        if fv is not None:
-            return sparse.COO.from_numpy(s, fill_value=fv)
-        return s
-
-    sa = sample_array(a_fv)
-    sb = sample_array(b_fv)
-    if op_name == "tensordot":
-        result = sparse.tensordot(sa, sb)
-    else:
-        op_name = np_ufunc_map.get(op_name, op_name)
-        try:
-            ufunc = np.__getattribute__(op_name)
-        except Exception as _:
-            ufunc = scipy.special.__getattribute__(op_name)
-        result = sparse.elemwise(ufunc, sa, sb)
-    if isinstance(result, sparse.SparseArray):
+def get_sparse_uop_densify(op_name): + ufunc = np.__getattribute__(op_name) + if ufunc(0) == 0: return False return True -def get_bop_fill_value(op_name, a_fv, b_fv): - if a_fv is None and b_fv is None: - return None - if a_fv is None: - return b_fv - if b_fv is None: - return a_fv - op_name = np_ufunc_map.get(op_name, op_name) - try: - func = np.__getattribute__(op_name) - except Exception as _: - func = scipy.special.__getattribute__(op_name) - return func(a_fv, b_fv) +sd_bop_sparse_ops = [ + "mul", +] + + +def get_sparse_bop_densify(op_name, a_dense: bool, b_dense: bool): + if a_dense and b_dense: + return True + + if a_dense or b_dense: + if op_name in sd_bop_sparse_ops: + return False + return True + + return False diff --git a/nums/core/array/view.py b/nums/core/array/view.py index a9186c9c..9fa40b7e 100644 --- a/nums/core/array/view.py +++ b/nums/core/array/view.py @@ -19,7 +19,7 @@ from nums.core.array import selection from nums.core.array import utils as array_utils -from nums.core.array.base import Block, BlockArrayBase +from nums.core.array.base import BlockBase, BlockArrayBase from nums.core.array.selection import BasicSelection from nums.core.kernel.kernel_manager import KernelManager from nums.core.grid.grid import ArrayGrid @@ -39,6 +39,7 @@ def from_subscript(cls, bab, subscript): def __init__(self, source, sel: BasicSelection = None, block_shape: tuple = None): self._source: BlockArrayBase = source self._km: KernelManager = self._source.km + self._concrete_cls = type(source) if sel is None: sel = BasicSelection.from_shape(self._source.shape) @@ -84,7 +85,7 @@ def basic_select(self, subscript: tuple): result: ArrayView = ArrayView(self._source, sel) return result - def create(self, concrete_cls=None) -> BlockArrayBase: + def create(self) -> BlockArrayBase: if self.sel.basic_steps(): if self.sel.is_aligned(self._source.block_shape): # Assertion below should form a conjunction with the above condition. @@ -95,20 +96,21 @@ def create(self, concrete_cls=None) -> BlockArrayBase: self.sel.get_broadcastable_block_shape(self.block_shape), self._source.block_shape, ) - return self.create_references(concrete_cls) + return self.create_references(self._concrete_cls) else: - return self.create_basic_single_step(concrete_cls) + return self.create_basic_single_step(self._concrete_cls) else: - return self.create_basic_multi_step(concrete_cls) + return self.create_basic_multi_step(self._concrete_cls) def create_references(self, concrete_cls) -> BlockArrayBase: # TODO (hme): Double check this. 
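To make the new densification rules concrete: `get_sparse_uop_densify` densifies exactly when an op maps the implicit zeros to a nonzero value, and `get_sparse_bop_densify` keeps mixed sparse/dense operands sparse only for the zero-annihilating ops listed in `sd_bop_sparse_ops` (currently just `mul`). A minimal sketch of the resulting behavior, assuming the helpers are importable from `nums.core.array.utils` as in this diff:

```python
from nums.core.array.utils import get_sparse_uop_densify, get_sparse_bop_densify

# Unary: densify iff ufunc(0) != 0.
assert get_sparse_uop_densify("exp")            # exp(0) == 1 -> densify
assert not get_sparse_uop_densify("negative")   # -0 == 0 -> stay sparse

# Binary: dense op dense -> dense; sparse op sparse -> sparse;
# mixed operands stay sparse only for ops in sd_bop_sparse_ops.
assert not get_sparse_bop_densify("mul", a_dense=True, b_dense=False)
assert get_sparse_bop_densify("add", a_dense=True, b_dense=False)
assert not get_sparse_bop_densify("add", a_dense=False, b_dense=False)
```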
- array_cls = BlockArrayBase if concrete_cls is None else concrete_cls - dst_ba: BlockArrayBase = array_cls(self.grid, self._km) + # array_cls = BlockArrayBase if concrete_cls is None else concrete_cls + # dst_ba: BlockArrayBase = array_cls(self.grid, self._km) + dst_ba: BlockArrayBase = concrete_cls(self.grid, self._km) if 0 in self.shape: return dst_ba grid_offset = self.sel.position().value // np.array( - self._source.block_shape, dtype=np.int + self._source.block_shape, dtype=np.intp ) dst_inflated_shape = self.sel.get_broadcastable_shape() dst_inflated_block_shape = self.sel.get_broadcastable_block_shape( @@ -123,7 +125,9 @@ def create_references(self, concrete_cls) -> BlockArrayBase: ): dst_grid_entry = dst_grid_entry_iterator[dst_index] src_grid_entry = tuple( - (np.array(dst_inflated_grid_entry, dtype=np.int) + grid_offset).tolist() + ( + np.array(dst_inflated_grid_entry, dtype=np.intp) + grid_offset + ).tolist() ) dst_ba.blocks[dst_grid_entry].oid = self._source.blocks[src_grid_entry].oid dst_ba.blocks[dst_grid_entry].transposed = self._source.blocks[ @@ -171,7 +175,7 @@ def create_basic_single_step(self, concrete_cls) -> BlockArrayBase: ] if src_dst_intersection_block.is_empty(): continue - src_block: Block = self._source.blocks[src_grid_entry] + src_block: BlockBase = self._source.blocks[src_grid_entry] src_oids.append(src_block.oid) src_sel_block: BasicSelection = src_sel_arr[src_grid_entry] src_dep_sel_loc = src_dst_intersection_block - src_sel_block.position() @@ -180,7 +184,7 @@ def create_basic_single_step(self, concrete_cls) -> BlockArrayBase: src_dst_intersection_block - dst_sel_offset_block.position() ) dst_params.append((dst_block_sel_loc.selector(), False)) - dst_block: Block = dst_ba.blocks.reshape(dst_grid_bc.grid_shape)[ + dst_block: BlockBase = dst_ba.blocks.reshape(dst_grid_bc.grid_shape)[ dst_grid_entry_bc ] dst_block.oid = km.create_block( @@ -261,7 +265,7 @@ def assign_references(self, dst_sel: BasicSelection, value): # but the destination selection may not have the same shape as value. # May need to broadcast value to destination selection output shape. dst_offset = dst_sel.position().value // np.array( - self._source.block_shape, dtype=np.int + self._source.block_shape, dtype=np.intp ) # Do we need to broadcast? if isinstance(value, ArrayView) and ( @@ -273,7 +277,7 @@ def assign_references(self, dst_sel: BasicSelection, value): # We don't need to create value to perform the reference copy. # No broadcasting required, so this should be okay. src_offset = value.sel.position().value // np.array( - value._source.block_shape, dtype=np.int + value._source.block_shape, dtype=np.intp ) src_inflated_shape = dst_sel.get_broadcastable_shape() src_inflated_block_shape = dst_sel.get_broadcastable_block_shape( @@ -286,18 +290,18 @@ def assign_references(self, dst_sel: BasicSelection, value): # Num axes in value grid may be too small. dst_grid_entry = tuple( ( - np.array(src_grid_entry_inflated, dtype=np.int) + dst_offset + np.array(src_grid_entry_inflated, dtype=np.intp) + dst_offset ).tolist() ) src_grid_entry = tuple( ( - np.array(src_grid_entry_inflated, dtype=np.int) + src_offset + np.array(src_grid_entry_inflated, dtype=np.intp) + src_offset ).tolist() ) # This is a reference assignment, and the grid properties between the # two blocks may differ, so retain those properties in the copy. 
- dst_block: Block = self._source.blocks[dst_grid_entry] - src_block_copy: Block = value._source.blocks[src_grid_entry].copy() + dst_block: BlockBase = self._source.blocks[dst_grid_entry] + src_block_copy: BlockBase = value._source.blocks[src_grid_entry].copy() src_block_copy.grid_entry = dst_block.grid_entry src_block_copy.grid_shape = dst_block.grid_shape self._source.blocks[dst_grid_entry] = src_block_copy @@ -322,13 +326,13 @@ def assign_references(self, dst_sel: BasicSelection, value): src_grid_entry = src_grid_entry_iterator[src_index] dst_grid_entry = tuple( ( - np.array(src_grid_entry_inflated, dtype=np.int) + dst_offset + np.array(src_grid_entry_inflated, dtype=np.intp) + dst_offset ).tolist() ) # This is a reference assignment, and the grid properties between the # two blocks may differ, so retain those properties in the copy. - dst_block: Block = self._source.blocks[dst_grid_entry] - src_block_copy: Block = src_ba.blocks[src_grid_entry].copy() + dst_block: BlockBase = self._source.blocks[dst_grid_entry] + src_block_copy: BlockBase = src_ba.blocks[src_grid_entry].copy() src_block_copy.grid_entry = dst_block.grid_entry src_block_copy.grid_shape = dst_block.grid_shape self._source.blocks[dst_grid_entry] = src_block_copy @@ -389,7 +393,7 @@ def basic_assign_single_step(self, dst_sel: BasicSelection, value): src_oids = [] src_params = [] dst_params = [] - dst_block: Block = dst_ba.blocks[dst_grid_entry] + dst_block: BlockBase = dst_ba.blocks[dst_grid_entry] for src_index, src_grid_entry_bc in enumerate( src_inflated_grid.get_entry_iterator() ): @@ -400,7 +404,7 @@ def basic_assign_single_step(self, dst_sel: BasicSelection, value): continue src_grid_entry = src_grid_entry_iterator[src_index] - src_block: Block = src_ba_bc.blocks[src_grid_entry] + src_block: BlockBase = src_ba_bc.blocks[src_grid_entry] src_oids.append(src_block.oid) src_sel_block_offset: BasicSelection = src_sel_offset[src_grid_entry_bc] diff --git a/nums/core/kernel/kernel_interface.py b/nums/core/kernel/kernel_interface.py index 04bd194d..0314011c 100644 --- a/nums/core/kernel/kernel_interface.py +++ b/nums/core/kernel/kernel_interface.py @@ -167,7 +167,7 @@ def any(self, arr, syskwargs: Dict): # Sparse - def dense_to_sparse(self, arr, fill_value, syskwargs: Dict): + def dense_to_sparse(self, arr, syskwargs: Dict): raise NotImplementedError() def sparse_to_dense(self, arr, syskwargs: Dict): @@ -179,6 +179,9 @@ def sparse_nnz(self, arr, syskwargs: Dict): def sparse_nbytes(self, arr, syskwargs: Dict): raise NotImplementedError() + def new_sparse_block(self, op_name, grid_entry, grid_meta, syskwargs: Dict): + raise NotImplementedError() + def sparse_random_block( self, rng_params, @@ -187,17 +190,27 @@ def sparse_random_block( shape, dtype, p, - fill_value, syskwargs: Dict, ): raise NotImplementedError() - def sparse_map_uop(self, op_name, arr, args, kwargs, syskwargs: Dict): + def sparse_map_uop(self, op_name, arr, args, kwargs, densify, syskwargs: Dict): raise NotImplementedError() def sparse_bop(self, op, a1, a2, a1_T, a2_T, axes, densify, syskwargs: Dict): raise NotImplementedError() + def sparse_reduce_axis( + self, op_name, arr, axis, keepdims, transposed, syskwargs: Dict + ): + raise NotImplementedError() + + def sparse_bop_reduce(self, op, a1, a2, a1_T, a2_T, syskwargs: Dict): + raise NotImplementedError() + + def sparse_block_from_scalar(self, x, syskwargs: Dict): + raise NotImplementedError() + def sdtp(self, s, *dense_arrays, syskwargs: Dict): raise NotImplementedError() diff --git 
a/nums/core/kernel/numpy_kernel.py b/nums/core/kernel/numpy_kernel.py
index ead191c7..1789622f 100644
--- a/nums/core/kernel/numpy_kernel.py
+++ b/nums/core/kernel/numpy_kernel.py
@@ -526,9 +526,8 @@ def any(self, arr):
 
     # Sparse
 
-    def dense_to_sparse(self, arr, fill_value):
-        result = COO.from_numpy(arr, fill_value=fill_value)
-        return result
+    def dense_to_sparse(self, arr):
+        return COO.from_numpy(arr, fill_value=0)
 
     def sparse_to_dense(self, arr):
         return arr.todense()
@@ -539,6 +538,16 @@ def sparse_nnz(self, arr):
     def sparse_nbytes(self, arr):
         return arr.nbytes
 
+    def new_sparse_block(self, op_name, grid_entry, grid_meta):
+        op_func = sparse.__getattribute__(op_name)
+        grid = ArrayGrid.from_meta(grid_meta)
+        block_shape = grid.get_block_shape(grid_entry)
+        if op_name == "eye":
+            assert np.all(np.diff(grid_entry) == 0)
+            return op_func(*block_shape, dtype=grid.dtype)
+        else:
+            return op_func(block_shape, dtype=grid.dtype)
+
     def sparse_random_block(
         self,
         rng_params,
@@ -547,7 +556,6 @@
         shape,
         dtype,
         p,
-        fill_value,
     ):
         rng = block_rng_legacy(*rng_params)
         rfunc = rng.__getattribute__(rfunc_name)
@@ -559,14 +567,11 @@
             random_state=rng,
             data_rvs=lambda s: rfunc(**rfunc_args, size=s),
             format="coo",
-            fill_value=fill_value,
+            fill_value=0,
         )
-        if rfunc_name != "randint":
-            # Only random and integer supports sampling of a specific type.
-            result = result.astype(dtype)
-        return result
+        return result.astype(dtype)
 
-    def sparse_map_uop(self, op_name, arr, args, kwargs):
+    def sparse_map_uop(self, op_name, arr, args, kwargs, densify):
         """
         Args:
             func: types.Callable
@@ -577,6 +582,10 @@
         args = list(args)
         args.insert(0, arr)
         result = sparse.elemwise(ufunc, *args, **kwargs)
+        if densify and isinstance(result, sparse.SparseArray):
+            result = result.todense()
+        elif not densify:
+            assert isinstance(result, sparse.SparseArray)
         return result
 
     def sparse_bop(self, op, a1, a2, a1_T, a2_T, axes, densify):
@@ -605,12 +614,45 @@
         assert isinstance(result, sparse.SparseArray)
         return result
 
+    def sparse_reduce_axis(self, op_name, arr, axis, keepdims, transposed):
+        assert isinstance(arr, sparse.COO)
+        op_func = np.__getattribute__(op_name)
+        if transposed:
+            arr = arr.T
+        return arr.reduce(op_func, axis=axis, keepdims=keepdims)
+
+    def sparse_bop_reduce(self, op, a1, a2, a1_T, a2_T):
+        assert isinstance(a1, sparse.COO) and isinstance(a2, sparse.COO)
+        if a1_T:
+            a1 = a1.T
+        if a2_T:
+            a2 = a2.T
+
+        # Addition and multiplication are faster than the generic stacked reduce.
+        if op == "sum":
+            r = a1 + a2
+        elif op == "prod":
+            r = a1 * a2
+        else:
+            a = sparse.stack([a1, a2], axis=0)
+            r = a.reduce(np.__getattribute__(op), axis=0, keepdims=False)
+
+        # Sanity check: a scalar result implies scalar operands;
+        # otherwise all shapes must match.
+        if np.isscalar(r):
+            assert np.isscalar(a1) and np.isscalar(a2)
+        else:
+            assert a1.shape == a2.shape == r.shape
+        return r
+
+    def sparse_block_from_scalar(self, x):
+        assert np.isscalar(x)
+        return sparse.COO.from_numpy(np.array(x), fill_value=0)
+
     def sdtp(self, s: sparse.COO, *dense_arrays):
         data = np.copy(s.data)
         for position in range(s.nnz):
             for axis in range(len(s.shape)):
                 data[position] *= dense_arrays[axis][s.coords[axis][position]]
-        return sparse.COO(s.coords, data, shape=s.shape, fill_value=s.fill_value)
+        return sparse.COO(s.coords, data, shape=s.shape, fill_value=0)
 
     def sdtd(self, s: sparse.COO, x: np.ndarray, y: np.ndarray, axes: int):
         # Check some things.
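Two conventions in the kernel above are worth illustrating. First, every COO handled here now has its fill value pinned to 0 (`dense_to_sparse`, `sparse_block_from_scalar`, `sdtp`, `sdtd`). Second, `sparse_bop_reduce` handles ops other than `sum`/`prod` by stacking the two operands on a new leading axis and reducing along it. A standalone sketch with pydata/sparse; the array values are illustrative, and the op is assumed to resolve to a ufunc such as `np.maximum`:

```python
import numpy as np
import sparse

# Fill value is always 0, as in dense_to_sparse.
a1 = sparse.COO.from_numpy(np.array([[0, 5], [3, 0]]), fill_value=0)
a2 = sparse.COO.from_numpy(np.array([[2, 0], [0, 7]]), fill_value=0)

# Pairwise elementwise reduce, as in sparse_bop_reduce's generic branch:
# stack on a new axis, then reduce along it.
stacked = sparse.stack([a1, a2], axis=0)  # shape (2, 2, 2)
r = stacked.reduce(np.maximum, axis=0, keepdims=False)

assert (r.todense() == np.array([[2, 5], [3, 7]])).all()
```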
@@ -635,4 +677,4 @@ def sdtd(self, s: sparse.COO, x: np.ndarray, y: np.ndarray, axes: int): sx = x[tuple(x_coords)] sy = y[tuple(y_coords)] data[position] *= np.tensordot(sx, sy, axes=axes) - return sparse.COO(s.coords, data, shape=s.shape, fill_value=s.fill_value) + return sparse.COO(s.coords, data, shape=s.shape, fill_value=0) diff --git a/nums/experimental/optimizer/fusion.py b/nums/experimental/optimizer/fusion.py index e20b04e7..40373351 100644 --- a/nums/experimental/optimizer/fusion.py +++ b/nums/experimental/optimizer/fusion.py @@ -5,7 +5,6 @@ import numpy as np from nums.core.array import utils as array_utils -from nums.core.array.base import Block from nums.core.grid.grid import Device from nums.experimental.optimizer.clusterstate import ClusterState @@ -49,7 +48,7 @@ def traverse(self, node: TreeNode, fuseable_nodes): ) if child_leafs is None: # This branch has been pruned. - return None, None, None + return None, None, 0 if len(child_block_id_set) <= self.max_args: # If it's a frontier node, then there's no point in fusing it. # If it's a leaf, we still want to count toward node_leafs, @@ -103,11 +102,15 @@ def update_graph(self, root, fuseable_nodes): def fuse_node(self, node: TreeNode): result = FunctionNode(node.cluster_state) - result.op_func, result.children = node.fuse(result, self.km) + result.op_func, result.children, result.tree_node_meta = node.fuse( + result, self.km + ) result.set_shape(node.shape()) result.set_grid_entry(node.grid_entry()) result.set_grid_shape(node.grid_shape()) result.set_dtype(node.dtype()) + assert result.tree_node_meta.shape == result.shape() + assert result.tree_node_meta.dtype == result.dtype() expression_str = node.expression() result.set_expression(expression_str) diff --git a/nums/experimental/optimizer/fusion_utils.py b/nums/experimental/optimizer/fusion_utils.py index cc4e8134..c47aedd1 100644 --- a/nums/experimental/optimizer/fusion_utils.py +++ b/nums/experimental/optimizer/fusion_utils.py @@ -40,3 +40,21 @@ def set_using_marker(node, inputs): node.update_child([child], [inputs[child.marker]]) inputs[child.marker].parent = node return node + + +def print_graph(root: TreeNode): + """ + Recursively print string representations of tree nodes using BFS. 
+ """ + print("print_graph") + nodes = [root] + while len(nodes) > 0: + node = nodes.pop(0) + if hasattr(node, "child"): + nodes.append(node.child) + if hasattr(node, "left"): + nodes.append(node.left) + nodes.append(node.right) + if hasattr(node, "children"): + nodes.extend(node.children) + print(node) diff --git a/nums/experimental/optimizer/graph.py b/nums/experimental/optimizer/graph.py index 3ac3f3df..2ffc7a51 100644 --- a/nums/experimental/optimizer/graph.py +++ b/nums/experimental/optimizer/graph.py @@ -21,11 +21,12 @@ from nums.core.settings import sync_nnz from nums.core.array import utils as array_utils -from nums.core.array.base import Block +from nums.core.array.base import BlockBase, Block +from nums.core.array.sparse import SparseBlock from nums.core.grid.grid import Device from nums.core.kernel.kernel_manager import KernelManager from nums.experimental.optimizer.clusterstate import ClusterState -from nums.experimental.optimizer.size import TreeNodeSize +from nums.experimental.optimizer.node_meta import TreeNodeMeta, LeafMeta def subsample(total_items, max_items, rs: np.random.RandomState): @@ -52,6 +53,13 @@ def __init__(self, cluster_state: ClusterState, tree_node_id=None): self._dtype = None self._expression = None + # Type LeafMeta for leaves: shape, nnz, dtype, and whether output is_dense. + # Type TreeNodeMeta for other nodes: dtype and whether output is_dense. + self.tree_node_meta: TreeNodeMeta = None + # Whether this operation invokes dense or sparse kernel. + # Need to explicitly set is_dense property after initialization. + self.dense_kernel = True + def get_root(self): if self.parent is None: return self @@ -123,6 +131,12 @@ def make_bop(self, op_name, other, args=None): assert self.parent is None and other.parent is None bop.left, bop.right = self, other bop.left.parent, bop.right.parent = bop, bop + bop.tree_node_meta = bop.left.tree_node_meta.bop_partial( + op_name, bop.right.tree_node_meta + ) + bop.dense_kernel = ( + bop.left.tree_node_meta.is_dense and bop.right.tree_node_meta.is_dense + ) return bop def tensordot(self, other, axes): @@ -154,7 +168,6 @@ def __init__(self, cluster_state: ClusterState, tree_node_id=None): super().__init__(cluster_state, tree_node_id) self.block = None self.marker = -1 - self.tree_node_size = None def get_children(self): return [] @@ -186,8 +199,8 @@ def copy(self, cluster_state, parent=None, new_ids=False): # This property is only used for fusion. leaf.marker = self.marker - # This property is for experimental output size estimation. 
- leaf.tree_node_size = self.tree_node_size + leaf.tree_node_meta = self.tree_node_meta + leaf.dense_kernel = self.dense_kernel return leaf def get_leafs(self): @@ -229,7 +242,7 @@ def expression(self): def fuse(self, func_node, km: KernelManager): f = km.get_fuseable("identity") - return f, [self.copy(self.cluster_state, func_node)] + return f, [self.copy(self.cluster_state, func_node)], self.tree_node_meta def is_scalar(self): return self.block.size() == 1 @@ -242,10 +255,11 @@ def __init__(self, cluster_state: ClusterState, tree_node_id=None): self.op_name = None def __repr__(self): - return "UnaryOp(name=%s, id=%s, child=%s)" % ( + return "UnaryOp(name=%s, id=%s, child=%s, dense_kernel=%s)" % ( self.op_name, str(self.tree_node_id), str(self.child.tree_node_id), + self.dense_kernel, ) def get_children(self): @@ -264,6 +278,9 @@ def copy(self, cluster_state, parent=None, new_ids=False): uop.child = self.child.copy(cluster_state, parent=uop, new_ids=new_ids) uop.op_name = self.op_name uop.copy_on_op = self.copy_on_op + + uop.tree_node_meta = self.tree_node_meta + uop.dense_kernel = self.dense_kernel return uop def update_child(self, old_children, new_children): @@ -313,11 +330,11 @@ def execute_on(self, device: Device, leaf_ids=None) -> Leaf: assert isinstance(self.child, Leaf) result = self._collapse(device) new_leaf: Leaf = result[0] - new_block: Block = result[1] + new_block: BlockBase = result[1] self.cluster_state.commit_uop(self._mem_cost(), self.child.block.id, device) # self.cluster_state.add_block(new_block.id, new_block.size(), [device]) self.cluster_state.add_block( - new_block.id, new_leaf.tree_node_size.nbytes, [device] + new_block.id, new_leaf.tree_node_meta.nbytes, [device] ) if not self.cluster_state.created_on_only: assert self.cluster_state.blocks_local( @@ -330,26 +347,28 @@ def execute_on(self, device: Device, leaf_ids=None) -> Leaf: def _collapse(self, device: Device): assert isinstance(self.child, Leaf) - block: Block = self.child.block + block: BlockBase = self.child.block op_name, args = self.op_name, {} if op_name == "transpose": - block: Block = block.transpose(defer=True) + block: BlockBase = block.transpose(defer=True) else: - block: Block = block.ufunc(op_name, device=device) + block: BlockBase = block.ufunc(op_name, device=device) leaf: Leaf = Leaf(self.cluster_state) leaf.block = block - leaf.tree_node_size = self.child.tree_node_size.uop(op_name) + assert isinstance(self.child.tree_node_meta, LeafMeta) + leaf.tree_node_meta = self.child.tree_node_meta.uop(op_name) leaf.copy_on_op = self.copy_on_op return leaf, block def _mem_cost(self): assert isinstance(self.child, Leaf) - block: Block = self.child.block + block: BlockBase = self.child.block if block.is_dense: return np.product(block.shape) if sync_nnz > 1: - self.child.tree_node_size.nnz = block.nnz # Blocking fetch - return self.child.tree_node_size.uop(self.op_name).nbytes + self.child.tree_node_meta.nnz = block.nnz # Blocking fetch + assert isinstance(self.child.tree_node_meta, LeafMeta) + return self.child.tree_node_meta.uop(self.op_name).nbytes def shape(self): if self._shape is None: @@ -387,24 +406,35 @@ def dtype(self): def expression(self): if self._expression is None: - self._expression = "UnaryOp(op=%s, x=%s)" % ( + self._expression = "UnaryOp(op=%s, dense_kernel=%s, x=%s)" % ( self.op_name, + self.dense_kernel, self.child.expression(), ) return self._expression def fuse(self, func_node, km: KernelManager): - child_op, child_args = self.child.fuse(func_node, km) + child_op, child_args, 
child_meta = self.child.fuse(func_node, km) if self.op_name == "transpose": self_op = km.get_fuseable("transpose") else: - self_op = km.get_fuseable("map_uop") - self_op = partial(self_op, op_name=self.op_name, args=(), kwargs={}) + if self.dense_kernel: + self_op = km.get_fuseable("map_uop") + self_op = partial(self_op, op_name=self.op_name, args=(), kwargs={}) + else: + self_op = km.get_fuseable("sparse_map_uop") + self_op = partial( + self_op, + op_name=self.op_name, + args=(), + kwargs={}, + densify=self.tree_node_meta.is_dense, + ) def fused(*args): return self_op(arr=child_op(*args)) - return fused, child_args + return fused, child_args, child_meta.uop(self.op_name) class ReduceAxis(UnaryOp): @@ -430,53 +460,66 @@ def copy(self, cluster_state, parent=None, new_ids=False): ra.axis = self.axis ra.keepdims = self.keepdims ra.copy_on_op = self.copy_on_op + + ra.tree_node_meta = self.tree_node_meta + ra.dense_kernel = self.dense_kernel return ra def _collapse(self, device: Device): assert isinstance(self.child, Leaf) - child_block: Block = self.child.block + child_block: BlockBase = self.child.block op_name, args = self.op_name, {} - block = Block( + block: BlockBase = Block( grid_entry=self.grid_entry(), grid_shape=self.grid_shape(), shape=self.shape(), dtype=self.dtype(), transposed=False, - km=child_block._km, - ) - block.oid = child_block._km.reduce_axis( - op_name=op_name, - arr=child_block.oid, - axis=self.axis, - keepdims=self.keepdims, - transposed=child_block.transposed, - syskwargs={"device": device}, + km=child_block.km, ) + if self.dense_kernel: + block.oid = child_block.km.reduce_axis( + op_name=op_name, + arr=child_block.oid, + axis=self.axis, + keepdims=self.keepdims, + transposed=child_block.transposed, + syskwargs={"device": device}, + ) + else: + block.oid = child_block.km.sparse_reduce_axis( + op_name=op_name, + arr=child_block.oid, + axis=self.axis, + keepdims=self.keepdims, + transposed=child_block.transposed, + syskwargs={"device": device}, + ) block._device = device leaf: Leaf = Leaf(self.cluster_state) leaf.block = block - leaf.tree_node_size = self.child.tree_node_size.reduce_axis( + assert isinstance(self.child.tree_node_meta, LeafMeta) + leaf.tree_node_meta = self.child.tree_node_meta.reduce_axis( self.op_name, self.axis, self.keepdims, - self.child.block.transposed, ) leaf.copy_on_op = self.copy_on_op return leaf, block def _mem_cost(self): assert isinstance(self.child, Leaf) - block: Block = self.child.block + block: BlockBase = self.child.block if block.is_dense: return np.product(self.shape()) if sync_nnz > 1: - self.child.tree_node_size.nnz = block.nnz # Blocking fetch - return self.child.tree_node_size.reduce_axis( + self.child.tree_node_meta.nnz = block.nnz # Blocking fetch + assert isinstance(self.child.tree_node_meta, LeafMeta) + return self.child.tree_node_meta.reduce_axis( self.op_name, self.axis, self.keepdims, - block.transposed, ).nbytes def update_tuple_property(self, val, keep_dim_val: Union[int, tuple] = 1): @@ -519,18 +562,25 @@ def dtype(self): def expression(self): if self._expression is None: - self._expression = "ReduceAxis(op=%s, x=%s, axis=%s, keepdims=%s)" % ( - self.op_name, - self.child.expression(), - str(self.axis), - str(self.keepdims), + self._expression = ( + "ReduceAxis(op=%s, axis=%s, keepdims=%s, dense_kernel=%s, x=%s)" + % ( + self.op_name, + str(self.axis), + str(self.keepdims), + self.dense_kernel, + self.child.expression(), + ) ) return self._expression def fuse(self, func_node, km: KernelManager): - child_op, child_args = 
self.child.fuse(func_node, km) + child_op, child_args, child_meta = self.child.fuse(func_node, km) - self_op = km.get_fuseable("reduce_axis") + if self.dense_kernel: + self_op = km.get_fuseable("reduce_axis") + else: + self_op = km.get_fuseable("sparse_reduce_axis") kwargs = { "op_name": self.op_name, "axis": self.axis, @@ -543,7 +593,11 @@ def fuse(self, func_node, km: KernelManager): def fused(*args): return self_op(arr=child_op(*args)) - return fused, child_args + return ( + fused, + child_args, + child_meta.reduce_axis(self.op_name, self.axis, self.keepdims), + ) class BinaryOp(TreeNode): @@ -563,11 +617,12 @@ def __repr__(self): "matmul": "@", "tensordot": "@", }[self.op_name] - return "BOp(id=%s, op=%s%s%s)" % ( + return "BOp(id=%s, op=%s%s%s, dense_kernel=%s)" % ( self.tree_node_id, str(self.left.tree_node_id), bop_symbol, str(self.right.tree_node_id), + self.dense_kernel, ) def get_children(self): @@ -591,6 +646,9 @@ def copy(self, cluster_state, parent=None, new_ids=False): bop.left = self.left.copy(cluster_state, bop, new_ids=new_ids) bop.right = self.right.copy(cluster_state, bop, new_ids=new_ids) bop.copy_on_op = self.copy_on_op + + bop.tree_node_meta = self.tree_node_meta + bop.dense_kernel = self.dense_kernel return bop def update_child(self, old_children, new_children): @@ -661,7 +719,7 @@ def execute_on(self, device: Device, leaf_ids=None) -> Leaf: assert isinstance(self.left, Leaf) and isinstance(self.right, Leaf) result = self._collapse(device) new_leaf: Leaf = result[0] - new_block: Block = result[1] + new_block: BlockBase = result[1] # This updates load on nodes and channels. # This also updates block states to indicate that they now reside on the provided nodes. # Update the cluster state after computing the leaf, so that transfer costs are properly @@ -672,7 +730,7 @@ def execute_on(self, device: Device, leaf_ids=None) -> Leaf: # Update cluster state with new block. # self.cluster_state.add_block(new_block.id, new_block.size(), [device]) self.cluster_state.add_block( - new_block.id, new_leaf.tree_node_size.nbytes, [device] + new_block.id, new_leaf.tree_node_meta.nbytes, [device] ) if not self.cluster_state.created_on_only: assert self.cluster_state.blocks_local( @@ -690,8 +748,8 @@ def execute_on(self, device: Device, leaf_ids=None) -> Leaf: def _collapse(self, device: Device): assert isinstance(self.left, Leaf) and isinstance(self.right, Leaf) - lblock: Block = self.left.block - rblock: Block = self.right.block + lblock: BlockBase = self.left.block + rblock: BlockBase = self.right.block if self.op_name == "matmul": op_name, args = "tensordot", {"axes": 1} elif self.op_name == "tensordot": @@ -699,12 +757,14 @@ def _collapse(self, device: Device): else: op_name, args = self.op_name, {} assert array_utils.can_broadcast_shapes(lblock.shape, rblock.shape) - block: Block = lblock.bop(op_name, rblock, args=args, device=device) + block: BlockBase = lblock.bop(op_name, rblock, args=args, device=device) leaf: Leaf = Leaf(self.cluster_state) leaf.block = block - leaf.tree_node_size = self.left.tree_node_size.bop( + assert isinstance(self.left.tree_node_meta, LeafMeta) + assert isinstance(self.right.tree_node_meta, LeafMeta) + leaf.tree_node_meta = self.left.tree_node_meta.bop( op_name, - self.right.tree_node_size, + self.right.tree_node_meta, **args, ) leaf.copy_on_op = self.copy_on_op @@ -714,8 +774,8 @@ def _mem_cost(self): # Computes the memory required to perform this operation. # We approximate by just computing the memory required to store the result. 
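+        # For sparse operands, the result size comes from LeafMeta.bop's
+        # binomial nnz estimate; with settings.sync_nnz > 1 the estimate is
+        # replaced by the exact nnz via a blocking fetch.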
assert isinstance(self.left, Leaf) and isinstance(self.right, Leaf)
-        lblock: Block = self.left.block
-        rblock: Block = self.right.block
+        lblock: BlockBase = self.left.block
+        rblock: BlockBase = self.right.block
         if self.op_name == "matmul":
             op_name, args = "tensordot", {"axes": 1}
         elif self.op_name == "tensordot":
@@ -726,11 +786,13 @@ def _mem_cost(self):
         if lblock.is_dense and rblock.is_dense:
             return np.product(self.shape())
         if sync_nnz > 1:
-            self.left.tree_node_size.nnz = lblock.nnz  # Blocking fetch
-            self.right.tree_node_size.nnz = rblock.nnz  # Blocking fetch
-        return self.left.tree_node_size.bop(
+            self.left.tree_node_meta.nnz = lblock.nnz  # Blocking fetch
+            self.right.tree_node_meta.nnz = rblock.nnz  # Blocking fetch
+        assert isinstance(self.left.tree_node_meta, LeafMeta)
+        assert isinstance(self.right.tree_node_meta, LeafMeta)
+        return self.left.tree_node_meta.bop(
             op_name,
-            self.right.tree_node_size,
+            self.right.tree_node_meta,
             **args,
         ).nbytes

@@ -801,27 +863,44 @@ def expression(self):
         if self._expression is None:
             if self.op_name == "matmul" or self.op_name == "tensordot":
                 axes = self.args.get("axes", 1)
-                self._expression = "BinaryOp(op=%s, x=%s, y=%s, axes=%s)" % (
-                    self.op_name,
-                    self.left.expression(),
-                    self.right.expression(),
-                    axes,
-                )
-            self._expression = "BinaryOp(op=%s, x=%s, y=%s)" % (
-                self.op_name,
-                self.left.expression(),
-                self.right.expression(),
-            )
+                self._expression = (
+                    "BinaryOp(op=%s, axes=%s, dense_kernel=%s, x=%s, y=%s)"
+                    % (
+                        self.op_name,
+                        axes,
+                        self.dense_kernel,
+                        self.left.expression(),
+                        self.right.expression(),
+                    )
+                )
+            else:
+                self._expression = "BinaryOp(op=%s, dense_kernel=%s, x=%s, y=%s)" % (
+                    self.op_name,
+                    self.dense_kernel,
+                    self.left.expression(),
+                    self.right.expression(),
+                )
         return self._expression

     def fuse(self, func_node, km: KernelManager):
-        left_op, left_args = self.left.fuse(func_node, km)
-        right_op, right_args = self.right.fuse(func_node, km)
-
-        self_op = km.get_fuseable("bop")
+        left_op, left_args, left_meta = self.left.fuse(func_node, km)
+        right_op, right_args, right_meta = self.right.fuse(func_node, km)
         axes = 1 if self.args is None else self.args.get("axes", 1)
-        self_op = partial(self_op, op=self.op_name, a1_T=False, a2_T=False, axes=axes)
+        if self.dense_kernel:
+            self_op = km.get_fuseable("bop")
+            self_op = partial(
+                self_op, op=self.op_name, a1_T=False, a2_T=False, axes=axes
+            )
+        else:
+            self_op = km.get_fuseable("sparse_bop")
+            self_op = partial(
+                self_op,
+                op=self.op_name,
+                a1_T=False,
+                a2_T=False,
+                axes=axes,
+                densify=self.tree_node_meta.is_dense,
+            )
         num_left = len(left_args)
         # Combine the left and right args.
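+        # The fused closure takes the concatenated leaf args; num_left marks
+        # where to split them between the left and right subgraphs.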
args = left_args + right_args @@ -836,7 +915,21 @@ def fused(*args): args2 = args[num_left:] return self_op(a1=left_op(*args1), a2=right_op(*args2)) - return fused, args + if self.op_name == "matmul": + op_name, extra_args = "tensordot", {"axes": 1} + elif self.op_name == "tensordot": + op_name, extra_args = "tensordot", self.args + else: + op_name, extra_args = self.op_name, {} + return ( + fused, + args, + left_meta.bop( + op_name, + right_meta, + **extra_args, + ), + ) class FunctionNode(TreeNode): @@ -867,7 +960,7 @@ def finalize(self, km: KernelManager): km.register(self.op_hash, self.op_func, {}) def __repr__(self): - return "Function(id=%s, op=%s, args=%s" % ( + return "Function(id=%s, op=%s, args=%s)" % ( self.tree_node_id, self.op_hash, len(self.children), @@ -895,11 +988,13 @@ def copy(self, cluster_state, parent=None, new_ids=False): fnode.parent = parent fnode.op_hash = self.op_hash fnode.op_func = self.op_func - fnode.op_expression = self.op_expression fnode.children = [ child.copy(cluster_state, fnode, new_ids=new_ids) for child in self.children ] fnode.copy_on_op = self.copy_on_op + + fnode.tree_node_meta = self.tree_node_meta + fnode.dense_kernel = self.dense_kernel return fnode def update_child(self, old_children, new_children): @@ -981,7 +1076,7 @@ def execute_on(self, device: Device, leaf_ids=None) -> Leaf: block_ids = [child.block.id for child in self.children] result = self._collapse(device) new_leaf: Leaf = result[0] - new_block: Block = result[1] + new_block: BlockBase = result[1] # This updates load on nodes and channels. # This also updates block states to indicate that they now reside on the provided nodes. # Update the cluster state after computing the leaf, so that transfer costs are properly @@ -1006,16 +1101,21 @@ def _collapse(self, device: Device): assert isinstance(child, Leaf) block_oids.append(child.block.oid) if km is None: - km = child.block._km - block: Block = Block( + km = child.block.km + if self.tree_node_meta.is_dense: + block_type = Block + else: + block_type = SparseBlock + block: BlockBase = block_type( self._grid_entry, self._grid_shape, self._shape, self._dtype, False, km ) block._device = device - block.oid = block._km.call( + block.oid = block.km.call( self.op_hash, *block_oids, syskwargs={"device": device} ) leaf: Leaf = Leaf(self.cluster_state) leaf.block = block + leaf.tree_node_meta = self.tree_node_meta leaf.copy_on_op = self.copy_on_op return leaf, block @@ -1063,7 +1163,7 @@ def __init__(self, cluster_state: ClusterState, tree_node_id=None): self.children: List[TreeNode] = None def __repr__(self): - return "Einsum(id=%s, subscript=%s, operands=%s" % ( + return "Einsum(id=%s, subscript=%s, operands=%s)" % ( self.tree_node_id, self.subscript, len(self.children), @@ -1175,7 +1275,7 @@ def execute_on(self, device: Device, leaf_ids=None) -> Leaf: block_ids = [child.block.id for child in self.children] result = self._collapse(device) new_leaf: Leaf = result[0] - new_block: Block = result[1] + new_block: BlockBase = result[1] # This updates load on nodes and channels. # This also updates block states to indicate that they now reside on the provided nodes. # Update the cluster state after computing the leaf, so that transfer costs are properly @@ -1184,7 +1284,7 @@ def execute_on(self, device: Device, leaf_ids=None) -> Leaf: # Update cluster state with new block. 
# self.cluster_state.add_block(new_block.id, new_block.size(), [device]) self.cluster_state.add_block( - new_block.id, new_leaf.tree_node_size.nbytes, [device] + new_block.id, new_leaf.tree_node_meta.nbytes, [device] ) if not self.cluster_state.created_on_only: for block_id in block_ids: @@ -1203,19 +1303,22 @@ def _collapse(self, device: Device): assert isinstance(child, Leaf) block_oids.append(child.block.oid) if km is None: - km = child.block._km - block: Block = Block( + km = child.block.km + block: BlockBase = Block( self.grid_entry(), self.grid_shape(), self.shape(), self.dtype(), False, km ) block._device = device - block.oid = block._km.einsum( + block.oid = block.km.einsum( self.subscript, *block_oids, syskwargs={"device": device} ) leaf: Leaf = Leaf(self.cluster_state) leaf.block = block # Assume dense for simplicity. - leaf.tree_node_size = TreeNodeSize( - self.shape(), np.prod(self.shape()), block.dtype, block.fill_value + leaf.tree_node_meta = LeafMeta( + self.shape(), + np.prod(self.shape()), + block.dtype, + block.is_dense, ) leaf.copy_on_op = self.copy_on_op return leaf, block diff --git a/nums/experimental/optimizer/grapharray.py b/nums/experimental/optimizer/grapharray.py index 5f604118..07de2b6f 100644 --- a/nums/experimental/optimizer/grapharray.py +++ b/nums/experimental/optimizer/grapharray.py @@ -23,7 +23,7 @@ from nums.core.settings import sync_nnz from nums.core.array import utils as array_utils -from nums.core.array.base import BlockArrayBase, Block +from nums.core.array.base import BlockBase, BlockArrayBase from nums.core.array.blockarray import BlockArray from nums.core.array.sparse import SparseBlockArray from nums.core.kernel.kernel_manager import KernelManager @@ -35,12 +35,13 @@ Leaf, UnaryOp, ReduceAxis, + FunctionNode, Einsum, ) from nums.experimental.optimizer.reduction_ops import TreeReductionOp from nums.experimental.optimizer.fusion import FuseGraph from nums.experimental.optimizer.fusion_utils import set_using_marker, traverse_marker -from nums.experimental.optimizer.size import TreeNodeSize +from nums.experimental.optimizer.node_meta import LeafMeta class GraphArray(object): @@ -50,7 +51,7 @@ def graphs_from_ba( ) -> np.ndarray: graphs = np.empty(shape=ba.grid.grid_shape, dtype=np.object) for grid_entry in ba.grid.get_entry_iterator(): - block: Block = ba.blocks[grid_entry] + block: BlockBase = ba.blocks[grid_entry] # Allocate the block to the node on which it's created. km: KernelManager = ba.km device: Device = km.device_grid.get_device( @@ -66,17 +67,18 @@ def graphs_from_ba( nnz = block.nnz # Blocking fetch else: nnz = np.prod(block.shape) - leaf.tree_node_size = TreeNodeSize( + leaf.tree_node_meta = LeafMeta( block.shape, nnz, block.dtype, - block.fill_value, + block.is_dense, ) + leaf.dense_kernel = block.is_dense leaf.copy_on_op = copy_on_op graphs[grid_entry] = leaf cluster_state.add_block( - block.id, leaf.tree_node_size.nbytes, devices=[device] + block.id, leaf.tree_node_meta.nbytes, devices=[device] ) cluster_state.init_mem_load(device, block.id) return graphs @@ -110,9 +112,7 @@ def to_ba(self): ), "Cannot convert unsolved GraphArray to BlockArray." 
if sample_node.block.is_dense: return BlockArray(self.grid.copy(), self.km, self.to_blocks()) - return SparseBlockArray( - self.grid.copy(), self.km, sample_node.block.fill_value, self.to_blocks() - ) + return SparseBlockArray(self.grid.copy(), self.km, self.to_blocks()) def __init__( self, @@ -166,7 +166,7 @@ def iterator(self): yield node def to_blocks(self) -> np.ndarray: - blocks: np.ndarray = np.empty(self.grid.grid_shape, dtype=Block) + blocks: np.ndarray = np.empty(self.grid.grid_shape, dtype=BlockBase) for grid_entry in self.grid.get_entry_iterator(): leaf: TreeNode = self.graphs[grid_entry] assert isinstance(leaf, Leaf), "%s,%s" % (str(leaf), type(leaf)) @@ -176,28 +176,29 @@ def to_blocks(self) -> np.ndarray: def other_to_ga(self, other): return GraphArray.to_ga(other, self.cluster_state, self.km, self.copy_on_op) - def tensordot(self, other, axes=2): - other = self.other_to_ga(other) + @staticmethod + def tensordot(a, b, axes=2): + b = a.other_to_ga(b) # TODO: Reuse BlockArrayBase tensordot operator. - this_axes = self.grid.grid_shape[:-axes] - this_sum_axes = self.grid.grid_shape[-axes:] - other_axes = other.grid.grid_shape[axes:] - other_sum_axes = other.grid.grid_shape[:axes] - assert this_sum_axes == other_sum_axes - result_shape = tuple(self.shape[:-axes] + other.shape[axes:]) - result_block_shape = tuple(self.block_shape[:-axes] + other.block_shape[axes:]) + a_axes = a.grid.grid_shape[:-axes] + a_sum_axes = a.grid.grid_shape[-axes:] + b_axes = b.grid.grid_shape[axes:] + b_sum_axes = b.grid.grid_shape[:axes] + assert a_sum_axes == b_sum_axes + result_shape = tuple(a.shape[:-axes] + b.shape[axes:]) + result_block_shape = tuple(a.block_shape[:-axes] + b.block_shape[axes:]) result_grid = ArrayGrid( shape=result_shape, block_shape=result_block_shape, - dtype=self.dtype.__name__, + dtype=a.dtype.__name__, ) - assert result_grid.grid_shape == tuple(this_axes + other_axes) + assert result_grid.grid_shape == tuple(a_axes + b_axes) result_graphs = np.empty(shape=result_grid.grid_shape, dtype=np.object) - this_dims = list(itertools.product(*map(range, this_axes))) - other_dims = list(itertools.product(*map(range, other_axes))) - sum_dims = list(itertools.product(*map(range, this_sum_axes))) - for i in this_dims: - for j in other_dims: + a_dims = list(itertools.product(*map(range, a_axes))) + b_dims = list(itertools.product(*map(range, b_axes))) + sum_dims = list(itertools.product(*map(range, a_sum_axes))) + for i in a_dims: + for j in b_dims: # A \in \R^{I \times K} # B \in \R^{K \times J} # C \in \R^{I \times J} @@ -205,38 +206,41 @@ def tensordot(self, other, axes=2): grid_entry = tuple(i + j) if len(sum_dims) == 1: k = sum_dims[0] - self_node: TreeNode = self.graphs[tuple(i + k)] - other_node: TreeNode = other.graphs[tuple(k + j)] - dot_node: TreeNode = self_node.tensordot(other_node, axes=axes) + a_node: TreeNode = a.graphs[tuple(i + k)] + b_node: TreeNode = b.graphs[tuple(k + j)] + dot_node: TreeNode = a_node.tensordot(b_node, axes=axes) result_graphs[grid_entry] = dot_node else: - rop = TreeReductionOp(self.cluster_state) + rop = TreeReductionOp(a.cluster_state) rop.set_grid_entry(grid_entry) rop.set_grid_shape(result_grid.grid_shape) rop.op_name = "sum" - rop.copy_on_op = self.copy_on_op + rop.copy_on_op = a.copy_on_op for k in sum_dims: - self_node: TreeNode = self.graphs[tuple(i + k)] - other_node: TreeNode = other.graphs[tuple(k + j)] - dot_node: TreeNode = self_node.tensordot(other_node, axes=axes) + a_node: TreeNode = a.graphs[tuple(i + k)] + b_node: TreeNode = 
b.graphs[tuple(k + j)] + dot_node: TreeNode = a_node.tensordot(b_node, axes=axes) # Explicitly add parent here, since sum depends on prod. # Not needed for other ops; make_bop takes care of it. # We don't need to copy the node here since the local # tree structure here is never exposed. dot_node.parent = rop rop.add_child(dot_node) + rop.tree_node_meta = dot_node.tree_node_meta + # Assume children are all dense or all sparse. + rop.dense_kernel = dot_node.tree_node_meta.is_dense result_graphs[grid_entry] = rop return GraphArray( result_grid, - self.cluster_state, + a.cluster_state, result_graphs, - self.km, - copy_on_op=self.copy_on_op, + a.km, + copy_on_op=a.copy_on_op, ) def __matmul__(self, other): - return self.tensordot(other, axes=1) + return self.tensordot(self, other, axes=1) def ga_from_arr(self, arr: Union[TreeNode, np.ndarray], result_shape: tuple): if isinstance(arr, TreeNode): @@ -370,6 +374,8 @@ def _add_uop(self, op_name, grid_entry, old_arr, new_arr): assert old_root.parent is None uop.child = old_root old_root.parent = uop + uop.tree_node_meta = old_root.tree_node_meta.uop_partial(op_name) + uop.dense_kernel = old_root.tree_node_meta.is_dense new_arr[grid_entry] = uop def reduce_axis(self, op_name, axis, keepdims): @@ -389,6 +395,12 @@ def reduce_axis(self, op_name, axis, keepdims): reduced_tnode.op_name = op_name reduced_tnode.axis = axis reduced_tnode.keepdims = keepdims + reduced_tnode.tree_node_meta = tnode.tree_node_meta.reduce_axis_partial( + op_name, + axis, + keepdims, + ) + reduced_tnode.dense_kernel = tnode.tree_node_meta.is_dense reduced_graphs[grid_entry] = reduced_tnode # Compute output GraphArray properties. @@ -443,6 +455,8 @@ def reduce_axis(self, op_name, axis, keepdims): rop.add_child(child) assert child.parent is None child.parent = rop + rop.tree_node_meta = child.tree_node_meta + rop.dense_kernel = child.tree_node_meta.is_dense if result_grid.shape == (): # keepdims = False. result_graphs[()] = rop @@ -470,6 +484,8 @@ def reduce_axis(self, op_name, axis, keepdims): rop.add_child(child) assert child.parent is None child.parent = rop + rop.tree_node_meta = child.tree_node_meta + rop.dense_kernel = child.tree_node_meta.is_dense result_graphs[result_grid_entry] = rop return GraphArray( @@ -486,16 +502,22 @@ def sum(self, axis=None, keepdims=False): def compile(self, max_args: int): result_graphs = np.empty_like(self.graphs, dtype=self.graphs.dtype) counter = 0 + first_grid_entry = (0,) * len(self.grid.shape) for grid_entry in self.grid.get_entry_iterator(): graph = self.graphs[grid_entry] _, leaf_inputs = traverse_marker(graph, 0) - if grid_entry == (0,) or grid_entry == (0, 0): # generic - result_graphs[grid_entry] = FuseGraph( - graph, self.km, max_args=max_args - )() - fused_graph = result_graphs[grid_entry] - fused_graph.op_expression = fused_graph._expression + if grid_entry == first_grid_entry: + fused_graph = FuseGraph(graph, self.km, max_args=max_args)() + if not isinstance(fused_graph, FunctionNode): + # Stopgap as this function currently assumes root is FunctionNode. + return self + result_graphs[grid_entry] = fused_graph + # result_graphs[grid_entry] = FuseGraph( + # graph, self.km, max_args=max_args + # )() + # fused_graph = result_graphs[grid_entry] else: + # TODO: support subtree fusion in this section. FuseGraph already does. 
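+                # Reuse the graph fused at the first grid entry: copy it with
+                # new ids, rebind this entry's leaf inputs via the traversal
+                # markers, and retarget the copy's grid entry.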
fused_graph_copy = fused_graph.copy(self.cluster_state, new_ids=True) fused_graph_copy = set_using_marker(fused_graph_copy, leaf_inputs) fused_graph_copy.set_grid_entry(grid_entry) diff --git a/nums/experimental/optimizer/size.py b/nums/experimental/optimizer/node_meta.py similarity index 63% rename from nums/experimental/optimizer/size.py rename to nums/experimental/optimizer/node_meta.py index e813259b..236fe580 100644 --- a/nums/experimental/optimizer/size.py +++ b/nums/experimental/optimizer/node_meta.py @@ -3,55 +3,72 @@ from nums.core.array import utils as array_utils -# TODO: integrate this class with FuseGraph. -class TreeNodeSize: +class TreeNodeMeta: """ - Encapsulated by each TreeNode to keep track of estimated or observed sizes of blocks. + Encapsulated by each TreeNode to keep track of node metadata like shape and output density. + """ + + def __init__(self, dtype, is_dense): + self.dtype = dtype + self.is_dense = is_dense + + def copy(self): + return TreeNodeMeta( + self.dtype, + self.is_dense, + ) + + def uop_partial(self, op_name): + """ + Does not update nnz, as it can't be estimated for non-leaf nodes. + """ + is_dense = self.is_dense or array_utils.get_sparse_uop_densify(op_name) + return TreeNodeMeta( + dtype=array_utils.get_uop_output_type(op_name, self.dtype), + is_dense=is_dense, + ) + + def reduce_axis_partial(self, op_name, axis, keepdims): + return TreeNodeMeta( + dtype=array_utils.get_uop_output_type(op_name, self.dtype), + is_dense=True, + ) + + def bop_partial(self, op_name, other, **kwargs): + dtype = array_utils.get_bop_output_type(op_name, self.dtype, other.dtype) + is_dense = array_utils.get_sparse_bop_densify( + op_name, + self.is_dense, + other.is_dense, + ) + return TreeNodeMeta(dtype, is_dense) + + +class LeafMeta(TreeNodeMeta): + """ + Encapsulated by each Leaf to keep track of sizes of blocks in addition to TreeNodeMeta. Sparse binary operations use B(n, p) to estimate nnz as needed. 
""" - def __init__(self, shape, nnz, dtype, fill_value=None, index_dtype=np.int64): + def __init__(self, shape, nnz, dtype, is_dense, index_dtype=np.int64): + super().__init__(dtype, is_dense) self.shape = shape self.nnz = nnz - self.dtype = dtype - self.fill_value = fill_value self.index_dtype = index_dtype if self.is_dense: assert nnz == np.prod(shape) else: assert index_dtype is not None - self.bop_estimation_map = { - "add": self.add, - "sum": self.add, - "sub": self.add, - "mul": self.mul, - "prod": self.mul, - "truediv": self.truediv, - "pow": self.pow, - "matmul": lambda other: self.tensordot(other, axes=1), - "tensordot": self.tensordot, - "lt": lambda other: self.inequality("lt", other), - "le": lambda other: self.inequality("le", other), - "gt": lambda other: self.inequality("gt", other), - "ge": lambda other: self.inequality("ge", other), - "eq": lambda other: self.inequality("eq", other), - "ne": lambda other: self.inequality("ne", other), - } - def copy(self): - return TreeNodeSize( + return LeafMeta( self.shape, self.nnz, self.dtype, - self.fill_value, + self.is_dense, self.index_dtype, ) - @property - def is_dense(self): - return self.fill_value is None - @property def nbytes(self): if self.is_dense: @@ -65,40 +82,49 @@ def nbytes(self): def uop(self, op_name): if op_name == "transpose": - return TreeNodeSize( + return LeafMeta( shape=tuple(reversed(self.shape)), nnz=self.nnz, dtype=self.dtype, - fill_value=self.fill_value, + is_dense=self.is_dense, ) - if self.is_dense: - fill_value = None - else: - fill_value = np.__getattribute__(op_name)(self.fill_value) - return TreeNodeSize( + nnz = self.nnz + is_dense = self.is_dense + if not is_dense and array_utils.get_sparse_uop_densify(op_name): + nnz = np.prod(self.shape) + is_dense = True + return LeafMeta( shape=self.shape, - nnz=self.nnz, + nnz=nnz, dtype=array_utils.get_uop_output_type(op_name, self.dtype), - fill_value=fill_value, + is_dense=is_dense, ) - def reduce_axis(self, op_name, axis, keepdims, transposed): - # Assume dense for now + def reduce_axis(self, op_name, axis, keepdims): shape = list(self.shape) - if transposed: - shape.reverse() if axis is None: shape = [] elif keepdims: shape[axis] = 1 else: shape.pop(axis) - return TreeNodeSize( - shape=tuple(shape), - nnz=np.prod(shape), - dtype=array_utils.get_uop_output_type(op_name, self.dtype), - fill_value=None, - ) + if self.is_dense: + return LeafMeta( + shape=tuple(shape), + nnz=np.prod(shape), + dtype=array_utils.get_uop_output_type(op_name, self.dtype), + is_dense=True, + ) + else: + # If any element in reduced axis is nonzero, result is nonzero. + p1 = self.nnz / np.prod(self.shape) + nnz = int((1 - (1 - p1) ** self.shape[axis]) * np.prod(shape)) + return LeafMeta( + shape=tuple(shape), + nnz=nnz, + dtype=array_utils.get_uop_output_type(op_name, self.dtype), + is_dense=False, + ) def _nnz_disjunction(self, other, shape): # If either element is nonzero, result is nonzero. @@ -118,7 +144,6 @@ def _nnz_conjunction(self, other, shape): def _nnz_selection(self, other, shape): # If self element is nonzero, result is nonzero. 
-        assert self.fill_value == 0
         n1 = np.prod(self.shape)
         p1 = self.nnz / n1
         return int(p1 * np.prod(shape))

@@ -126,29 +151,27 @@ def add(self, other):
         shape = array_utils.broadcast_shape(self.shape, other.shape)
         dtype = array_utils.get_bop_output_type("add", self.dtype, other.dtype)
-        is_dense = array_utils.get_sparse_bop_return_type(
+        is_dense = array_utils.get_sparse_bop_densify(
             "add",
-            self.fill_value,
-            other.fill_value,
+            self.is_dense,
+            other.is_dense,
         )
         if is_dense:
-            return TreeNodeSize(
+            return LeafMeta(
                 shape=shape,
                 nnz=np.prod(shape),
                 dtype=dtype,
-                fill_value=None,
+                is_dense=True,
             )
         if self.is_dense or other.is_dense:
             raise ValueError(
-                "TreeNodeSize.__add__ is inconsistent with sparse bop rules."
+                "LeafMeta.add is inconsistent with sparse bop rules."
             )
-        return TreeNodeSize(
+        return LeafMeta(
             shape=shape,
             nnz=self._nnz_disjunction(other, shape),
             dtype=dtype,
-            fill_value=array_utils.get_bop_fill_value(
-                "add", self.fill_value, other.fill_value
-            ),
+            is_dense=False,
         )

     __add__ = add

@@ -156,42 +179,33 @@ def add(self, other):
     def mul(self, other):
         shape = array_utils.broadcast_shape(self.shape, other.shape)
         dtype = array_utils.get_bop_output_type("mul", self.dtype, other.dtype)
-        is_dense = array_utils.get_sparse_bop_return_type(
+        is_dense = array_utils.get_sparse_bop_densify(
             "mul",
-            self.fill_value,
-            other.fill_value,
+            self.is_dense,
+            other.is_dense,
         )
         if is_dense:
-            return TreeNodeSize(
+            return LeafMeta(
                 shape=shape,
                 nnz=np.prod(shape),
                 dtype=dtype,
-                fill_value=None,
+                is_dense=True,
             )
         if not self.is_dense and not other.is_dense:
-            if self.fill_value == 0 and other.fill_value == 0:
-                nnz = self._nnz_conjunction(other, shape)
-            elif self.fill_value == 0:
-                nnz = self._nnz_selection(other, shape)
-            elif other.fill_value == 0:
-                nnz = other._nnz_selection(self, shape)
-            else:
-                nnz = self._nnz_disjunction(other, shape)
-        elif self.fill_value == 0 and other.is_dense:
+            nnz = self._nnz_conjunction(other, shape)
+        elif not self.is_dense and other.is_dense:
             nnz = self._nnz_selection(other, shape)
-        elif self.is_dense and other.fill_value == 0:
+        elif self.is_dense and not other.is_dense:
             nnz = other._nnz_selection(self, shape)
         else:
             raise ValueError(
-                "TreeNodeSize.__mul__ is inconsistent with sparse bop rules."
+                "LeafMeta.mul is inconsistent with sparse bop rules."
             )
-        return TreeNodeSize(
+        return LeafMeta(
             shape=shape,
             nnz=nnz,
             dtype=dtype,
-            fill_value=array_utils.get_bop_fill_value(
-                "mul", self.fill_value, other.fill_value
-            ),
+            is_dense=False,
         )

     __mul__ = mul

@@ -199,35 +213,33 @@ def mul(self, other):
     def truediv(self, other):
         shape = array_utils.broadcast_shape(self.shape, other.shape)
         dtype = array_utils.get_bop_output_type("truediv", self.dtype, other.dtype)
-        is_dense = array_utils.get_sparse_bop_return_type(
+        is_dense = array_utils.get_sparse_bop_densify(
             "truediv",
-            self.fill_value,
-            other.fill_value,
+            self.is_dense,
+            other.is_dense,
         )
         if is_dense:
-            return TreeNodeSize(
+            return LeafMeta(
                 shape=shape,
                 nnz=np.prod(shape),
                 dtype=dtype,
-                fill_value=None,
+                is_dense=True,
             )
         if self.is_dense or other.is_dense:
             raise ValueError(
-                "TreeNodeSize.__add__ is inconsistent with sparse bop rules."
+                "LeafMeta.truediv is inconsistent with sparse bop rules."
            )
-        if other.fill_value == 0:  # nan
-            nnz = other._nnz_selection(self, shape)
-        elif self.fill_value == 0:
-            nnz = self._nnz_selection(other, shape)
-        else:
-            nnz = self._nnz_disjunction(other, shape)
-        return TreeNodeSize(
+        # Both operands are sparse here: division by an implicit zero yields
+        # nan, so estimate nnz from the denominator's nonzeros.
+        nnz = other._nnz_selection(self, shape)
+        return LeafMeta(
             shape=shape,
             nnz=nnz,
             dtype=dtype,
-            fill_value=array_utils.get_bop_fill_value(
-                "truediv", self.fill_value, other.fill_value
-            ),
+            is_dense=False,
         )

     __truediv__ = truediv

@@ -248,22 +260,17 @@ def tensordot(self, other, axes):
         shape = tuple(self.shape + other.shape)
         sum_shape = (1,)
         dtype = array_utils.get_bop_output_type("tensordot", self.dtype, other.dtype)
-        is_dense = array_utils.get_sparse_bop_return_type(
-            "tensordot", self.fill_value, other.fill_value
+        is_dense = array_utils.get_sparse_bop_densify(
+            "tensordot", self.is_dense, other.is_dense
         )
         if is_dense:
-            return TreeNodeSize(
+            return LeafMeta(
                 shape=shape,
                 nnz=np.prod(shape),
                 dtype=dtype,
-                fill_value=None,
+                is_dense=True,
             )
-        if (
-            self.is_dense
-            or other.is_dense
-            or self.fill_value != 0
-            or other.fill_value != 0
-        ):
+        if self.is_dense or other.is_dense:
             raise ValueError(
-                "TreeNodeSize.tensordot is inconsistent with sparse bop rules."
+                "LeafMeta.tensordot is inconsistent with sparse bop rules."
             )
@@ -273,36 +280,50 @@
         p2 = other.nnz / n2
         m = np.prod(shape)
         k = np.prod(sum_shape)
-        return TreeNodeSize(
+        return LeafMeta(
             shape=shape,
             nnz=int((1 - (1 - p1 * p2) ** k) * m),
             dtype=dtype,
-            fill_value=0,
+            is_dense=False,
         )

     def inequality(self, op_name, other):
         assert other.shape == ()
         dtype = array_utils.get_bop_output_type(op_name, self.dtype, other.dtype)
-        fill_value = array_utils.get_bop_fill_value(
-            op_name, self.fill_value, other.fill_value
-        )
-        return TreeNode(
+        return LeafMeta(
             shape=self.shape,
             nnz=self.nnz,
             dtype=dtype,
-            fill_value=fill_value,
+            is_dense=self.is_dense,
         )

-    def bop_dense(self, other):
+    def bop_dense(self, other, **kwargs):
         # NOTE: as catch-all fallback, this may be wrong for unknown non-elementwise binary ops.
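+        # Fall back to a dense result over the broadcast shape, i.e. nnz equals
+        # the full element count; a conservative upper bound on memory.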
shape = array_utils.broadcast_shape(self.shape, other.shape)
         dtype = array_utils.get_bop_output_type("add", self.dtype, other.dtype)
-        return TreeNodeSize(
+        return LeafMeta(
             shape=shape,
             nnz=np.prod(shape),
             dtype=dtype,
-            fill_value=None,
+            is_dense=True,
         )

     def bop(self, op_name, other, **kwargs):
-        return self.bop_estimation_map.get(op_name, self.bop_dense)(other, **kwargs)
+        bop_estimation_map = {
+            "add": self.add,
+            "sum": self.add,
+            "sub": self.add,
+            "mul": self.mul,
+            "prod": self.mul,
+            "truediv": self.truediv,
+            "pow": self.pow,
+            "matmul": lambda other: self.tensordot(other, axes=1),
+            "tensordot": self.tensordot,
+            "lt": lambda other: self.inequality("lt", other),
+            "le": lambda other: self.inequality("le", other),
+            "gt": lambda other: self.inequality("gt", other),
+            "ge": lambda other: self.inequality("ge", other),
+            "eq": lambda other: self.inequality("eq", other),
+            "ne": lambda other: self.inequality("ne", other),
+        }
+        return bop_estimation_map.get(op_name, self.bop_dense)(other, **kwargs)
diff --git a/nums/experimental/optimizer/reduction_ops.py b/nums/experimental/optimizer/reduction_ops.py
index d5879d15..50228ad3 100644
--- a/nums/experimental/optimizer/reduction_ops.py
+++ b/nums/experimental/optimizer/reduction_ops.py
@@ -20,7 +20,7 @@
 import numpy as np

 from nums.core.settings import sync_nnz
-from nums.core.array.base import Block
+from nums.core.array.base import BlockBase
 from nums.core.grid.grid import Device
 from nums.core.kernel.kernel_manager import KernelManager
 from nums.experimental.optimizer.clusterstate import ClusterState
@@ -41,10 +41,11 @@ def __init__(self, cluster_state: ClusterState, tree_node_id=None, seed=1337):
         self.action_leaf_q = []

     def __repr__(self):
-        return "Reduc(id=%s, op=%s, in=%d)" % (
+        return "Reduc(id=%s, op=%s, in=%d, dense_kernel=%s)" % (
             str(self.tree_node_id),
             self.op_name,
             len(self.children_dict),
+            self.dense_kernel,
         )

     def get_children(self):
@@ -83,6 +84,9 @@ def copy(self, cluster_state, parent=None, new_ids=False):
             if child.tree_node_id in self.leafs_dict:
                 rop.leafs_dict[child_copy.tree_node_id] = child_copy
         # TODO (hme): How do we properly copy random state?
+
+        rop.tree_node_meta = self.tree_node_meta
+        rop.dense_kernel = self.dense_kernel
         return rop

     def add_child(self, child: TreeNode):
@@ -215,7 +219,7 @@ def execute_on(self, device: Device, leaf_ids=None) -> TreeNode:
         assert isinstance(left, Leaf) and isinstance(right, Leaf)
         result = self._collapse(device, left, right)
         new_leaf: Leaf = result[0]
-        new_block: Block = result[1]
+        new_block: BlockBase = result[1]

         # Update action leaf queue.
         assert set(leaf_ids) == {self.action_leaf_q.pop(0), self.action_leaf_q.pop(0)}
@@ -231,7 +235,7 @@ def execute_on(self, device: Device, leaf_ids=None) -> TreeNode:
         # Update cluster state with new block.
# self.cluster_state.add_block(new_block.id, new_block.size(), [device]) self.cluster_state.add_block( - new_block.id, new_leaf.tree_node_size.nbytes, [device] + new_block.id, new_leaf.tree_node_meta.nbytes, [device] ) if not self.cluster_state.created_on_only: assert self.cluster_state.blocks_local(left.block.id, right.block.id) @@ -257,30 +261,42 @@ def execute_on(self, device: Device, leaf_ids=None) -> TreeNode: return self def _collapse(self, device: Device, left: Leaf, right: Leaf): - lblock: Block = left.block - rblock: Block = right.block + lblock: BlockBase = left.block + rblock: BlockBase = right.block if self.op_name == "matmul": raise ValueError("matmul is not a supported reduction operator.") op_name, args = self.op_name, {} assert lblock.shape == rblock.shape - block: Block = lblock.copy() + block: BlockBase = lblock.copy() block.transposed = False block.dtype = array_utils.get_reduce_output_type(self.op_name, lblock.dtype) - block.oid = lblock._km.bop_reduce( - op_name, - lblock.oid, - rblock.oid, - lblock.transposed, - rblock.transposed, - syskwargs={"device": device}, - ) + if self.dense_kernel: + assert lblock.is_dense and rblock.is_dense + block.oid = lblock.km.bop_reduce( + op_name, + lblock.oid, + rblock.oid, + lblock.transposed, + rblock.transposed, + syskwargs={"device": device}, + ) + else: + assert not lblock.is_dense or not rblock.is_dense + block.oid = lblock.km.sparse_bop_reduce( + op_name, + lblock.oid, + rblock.oid, + lblock.transposed, + rblock.transposed, + syskwargs={"device": device}, + ) block._device = device leaf: Leaf = Leaf(self.cluster_state) leaf.block = block - leaf.tree_node_size = left.tree_node_size.bop( + leaf.tree_node_meta = left.tree_node_meta.bop( op_name, - right.tree_node_size, + right.tree_node_meta, **args, ) leaf.copy_on_op = self.copy_on_op @@ -294,22 +310,22 @@ def _mem_cost(self, leafs): shape = None for leaf in leafs: assert leaf.tree_node_id in self.leafs_dict - leaf_block: Block = leaf.block + leaf_block: BlockBase = leaf.block if shape is None: shape = leaf_block.shape else: assert leaf_block.shape == shape - leaf_block: Block = leafs[0].block + leaf_block: BlockBase = leafs[0].block if leaf_block.is_dense: return leaf_block.size() if sync_nnz > 1: - leafs[0].tree_node_size.nnz = leafs[0].block.nnz # Blocking fetch - leafs[1].tree_node_size.nnz = leafs[1].block.nnz # Blocking fetch + leafs[0].tree_node_meta.nnz = leafs[0].block.nnz # Blocking fetch + leafs[1].tree_node_meta.nnz = leafs[1].block.nnz # Blocking fetch return ( leafs[0] - .tree_node_size.bop( + .tree_node_meta.bop( self.op_name, - leafs[1].tree_node_size, + leafs[1].tree_node_meta, ) .nbytes ) @@ -350,10 +366,14 @@ def expression(self): # This will force a different hash for large fused reductions, # if this is, for whatever reason, needed. 
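+        # Including dense_kernel below keeps sparse and dense reductions from
+        # sharing a fused-kernel hash.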
size = len(self.children_dict) - self._expression = "TreeReductionOp(op=%s, size=%s, id=%s)" % ( - self.op_name, - str(size), - self.tree_node_id, + self._expression = ( + "TreeReductionOp(op=%s, size=%s, id=%s, dense_kernel=%s)" + % ( + self.op_name, + str(size), + self.tree_node_id, + self.dense_kernel, + ) ) return self._expression diff --git a/nums/numpy/api/creation.py b/nums/numpy/api/creation.py index a76e815c..5257e38b 100644 --- a/nums/numpy/api/creation.py +++ b/nums/numpy/api/creation.py @@ -227,7 +227,7 @@ def array(object, dtype=None, copy=True, order="K", ndmin=0, subok=False) -> Blo def from_coo(a: sparse.COO): - assert(isinstance(a, sparse.COO)) + assert isinstance(a, sparse.COO) dtype = np.__getattribute__(str(a.dtype)) shape = a.shape app = _instance() diff --git a/nums/numpy/random.py b/nums/numpy/random.py index 4f031df5..edd7d6a4 100644 --- a/nums/numpy/random.py +++ b/nums/numpy/random.py @@ -79,25 +79,20 @@ def permutation(self, x): arr_perm = self.rs().permutation(shape[0], block_shape[0]).get() return x[arr_perm] - def sparse_randn(self, density, fill_value, *shape): + def sparse_randn(self, density, *shape): shape, block_shape = self._get_shapes(shape, _np.float64) return self.rs().sparse_normal( shape=shape, block_shape=block_shape, p=density, - fill_value=fill_value, ) - def sparse_randint( - self, low, high=None, size=None, dtype=int, density=0.01, fill_value=0 - ): + def sparse_randint(self, low, high=None, size=None, dtype=int, density=0.01): if high is None: high = low low = 0 shape, block_shape = self._get_shapes(size, dtype) - return self.rs().sparse_randint( - low, high, dtype, shape, block_shape, density, fill_value - ) + return self.rs().sparse_randint(low, high, dtype, shape, block_shape, density) # Default imp. 
diff --git a/tests/core/array/test_bop.py b/tests/core/array/test_bop.py index d14da1b0..166c4a8e 100644 --- a/tests/core/array/test_bop.py +++ b/tests/core/array/test_bop.py @@ -69,7 +69,7 @@ def test_tensordot_basic(app_inst: ArrayApplication): shape = 2, 4, 10, 15 npX = np.arange(np.product(shape)).reshape(*shape) rX = app_inst.array(npX, block_shape=(1, 2, 10, 3)) - rResult = rX.T.tensordot(rX, axes=1) + rResult = rX.tensordot(rX.T, rX, axes=1) assert np.allclose(rResult.get(), (np.tensordot(npX.T, npX, axes=1))) common.check_block_integrity(rResult) @@ -93,7 +93,7 @@ def test_tensordot_large_shape(app_inst: ArrayApplication): block_a = app_inst.array(a, block_shape=(30, 5, 3, 2)) block_b = app_inst.array(b, block_shape=(2, 3, 5, 25)) - block_c = block_a.tensordot(block_b, axes=1) + block_c = block_a.tensordot(block_a, block_b, axes=1) assert np.allclose(block_c.get(), c) common.check_block_integrity(block_c) @@ -129,7 +129,7 @@ def test_tensordot_all_shapes(app_inst: ArrayApplication): ) block_a = app_inst.array(a, block_shape=a_block_shape) block_b = app_inst.array(b, block_shape=b_block_shape) - block_c = block_a.tensordot(block_b, axes=axes) + block_c = block_a.tensordot(block_a, block_b, axes=axes) assert np.allclose(block_c.get(), c) common.check_block_integrity(block_c) diff --git a/tests/core/array/test_sparse.py b/tests/core/array/test_sparse.py index 4ef9bc06..b03c8472 100644 --- a/tests/core/array/test_sparse.py +++ b/tests/core/array/test_sparse.py @@ -1,9 +1,7 @@ import numpy as np import sparse -import pytest from nums.core.array.application import ArrayApplication -from nums.core.array.blockarray import BlockArray from nums.core.array.sparse import SparseBlockArray from nums.core.array.random import NumsRandomState @@ -11,9 +9,8 @@ def test_sparse_init(app_inst: ArrayApplication): x1 = np.array([[0, 0, 1, 1], [0, 0, 1, 1], [2, 2, 0, 0], [2, 2, 0, 0]]) x_ba = app_inst.array(x1, block_shape=(2, 2)) - x_sba = SparseBlockArray.from_ba(x_ba, fill_value=0) + x_sba = SparseBlockArray.from_ba(x_ba) assert x_sba.nnz == 8 - assert x_sba.nbytes == 8 * 4 + 2 * 8 * 8 y_ba = x_sba.to_ba() assert np.array_equal(x1, y_ba.get()) @@ -21,12 +18,11 @@ def test_sparse_init(app_inst: ArrayApplication): def test_from_coo(app_inst: ArrayApplication): row_coords = [0, 1, 0, 1, 2, 2, 3, 3] col_coords = [3, 2, 2, 3, 0, 1, 0, 1] - values = [1, 1, 1, 1, 2, 2, 2, 2] + values = [1, 1, 1, 1, 2, 2, 2, 2] x_sp = sparse.COO([row_coords, col_coords], values) x_de = x_sp.todense() x_sba = app_inst.array(x_sp, block_shape=(2, 2)) assert x_sba.nnz == 8 - assert x_sba.nbytes == 8 * 4 + 2 * 8 * 8 y_ba = x_sba.to_ba() assert np.array_equal(x_de, y_ba.get()) @@ -34,24 +30,26 @@ def test_from_coo(app_inst: ArrayApplication): def test_sparse_random(app_inst: ArrayApplication): rs: NumsRandomState = app_inst.random_state(1337) x_sba = rs.sparse_randint( - 1, high=5, dtype=int, shape=(15, 10), block_shape=(5, 5), p=0.1, fill_value=0 + 1, + high=5, + dtype=np.int64, + shape=(100, 50), + block_shape=(50, 50), + p=0.1, ) x_ba = x_sba.to_ba() x_np = x_ba.get() x_sp = sparse.COO.from_numpy(x_np, fill_value=0) assert x_sba.nnz == x_sp.nnz - assert x_sba.nbytes == x_sp.nbytes - print(x_np) def test_sparse_uop(app_inst: ArrayApplication): x = np.array([[0, 0, 1, 1], [0, 0, 1, 1], [2, 2, 0, 0], [2, 2, 0, 0]]) - x_sp = sparse.COO.from_numpy(x, fill_value=1) + x_sp = sparse.COO.from_numpy(x, fill_value=0) x_sp = sparse.elemwise(np.negative, x_sp) x_ba = app_inst.array(x, block_shape=(2, 2)) - x_sba = 
SparseBlockArray.from_ba(x_ba, fill_value=1) + x_sba = SparseBlockArray.from_ba(x_ba) y_sba = -x_sba - assert y_sba.fill_value == x_sp.fill_value # -1 assert y_sba.nnz == x_sp.nnz # 12 y_ba = y_sba.to_ba() assert np.array_equal(np.negative(x), y_ba.get()) @@ -60,44 +58,46 @@ def test_sparse_uop(app_inst: ArrayApplication): def test_sparse_add(app_inst: ArrayApplication): x1 = np.array([[0, 0, 1, 1], [0, 0, 1, 1], [2, 2, 0, 0], [2, 2, 0, 0]]) x2 = x1 * 2 - x1_sp = sparse.COO.from_numpy(x1, fill_value=2) - x2_sp = sparse.COO.from_numpy(x2, fill_value=2) + x1_sp = sparse.COO.from_numpy(x1, fill_value=0) + x2_sp = sparse.COO.from_numpy(x2, fill_value=0) x1_ba = app_inst.array(x1, block_shape=(2, 2)) x2_ba = app_inst.array(x2, block_shape=(2, 2)) - x1_sba = SparseBlockArray.from_ba(x1_ba, fill_value=2) - x2_sba = SparseBlockArray.from_ba(x2_ba, fill_value=2) + x1_sba = SparseBlockArray.from_ba(x1_ba) + x2_sba = SparseBlockArray.from_ba(x2_ba) y_sp = x1_sp + x2_sp y_sba = x1_sba + x2_sba - assert y_sba.fill_value == y_sp.fill_value # 4 assert y_sba.nnz == y_sp.nnz # 16 y_ba = y_sba.to_ba() assert np.array_equal(x1 + x2, y_ba.get()) - # Test sparse-dense. + # Test dense-sparse. y_ba = x2_ba + x1_sba # __radd__ assert np.array_equal(x2 + x1_sp, y_ba.get()) + y_ba = x1_sba - x2_ba # __sub__ + assert np.array_equal(x1_sp - x2, y_ba.get()) + y_ba = x2_ba - x1_sba # __rsub__ + assert np.array_equal(x2 - x1_sp, y_ba.get()) # Test sparse-scalar. - y_sp = x1_sp - 1 - y_sba = x1_sba - 1 - assert y_sba.fill_value == y_sp.fill_value # 4 - assert y_sba.nnz == y_sp.nnz # 16 - y_ba = y_sba.to_ba() - assert np.array_equal(x1 - 1, y_ba.get()) + y = (x1_sp - 1).todense() # __sub__ + y_ba = x1_sba - 1 + assert np.array_equal(y, y_ba.get()) + y = (1 - x1_sp).todense() # __rsub__ + y_ba = 1 - x1_sba + assert np.array_equal(y, y_ba.get()) def test_sparse_mul(app_inst: ArrayApplication): x1 = np.array([[0, 0, 1, 1], [0, 0, 1, 1], [2, 2, 0, 0], [2, 2, 0, 0]]) x2 = np.array([[0, 0, 1, 1], [0, 0, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]]) - x1_sp = sparse.COO.from_numpy(x1, fill_value=2) - x2_sp = sparse.COO.from_numpy(x2, fill_value=2) + x1_sp = sparse.COO.from_numpy(x1, fill_value=0) + x2_sp = sparse.COO.from_numpy(x2, fill_value=0) x1_ba = app_inst.array(x1, block_shape=(2, 2)) x2_ba = app_inst.array(x2, block_shape=(2, 2)) - x1_sba = SparseBlockArray.from_ba(x1_ba, fill_value=2) - x2_sba = SparseBlockArray.from_ba(x2_ba, fill_value=2) + x1_sba = SparseBlockArray.from_ba(x1_ba) + x2_sba = SparseBlockArray.from_ba(x2_ba) y_sp = x1_sp * x2_sp y_sba = x1_sba * x2_sba - assert y_sba.fill_value == y_sp.fill_value # 4 assert y_sba.nnz == y_sp.nnz # 16 y_ba = y_sba.to_ba() assert np.array_equal(x1 * x2, y_ba.get()) @@ -105,10 +105,10 @@ def test_sparse_mul(app_inst: ArrayApplication): # Test sparse-dense. 
rs: NumsRandomState = app_inst.random_state(1337) x1_sba = rs.sparse_randint( - 1, high=5, dtype=int, shape=(100, 50), block_shape=(5, 5), p=0.1, fill_value=0 + 1, high=5, dtype=int, shape=(100, 50), block_shape=(5, 5), p=0.1 ) x2_sba = rs.sparse_randint( - 1, high=5, dtype=int, shape=(1, 50), block_shape=(5, 5), p=0.1, fill_value=0 + 1, high=5, dtype=int, shape=(1, 50), block_shape=(5, 5), p=0.1 ) x1_ba = x1_sba.to_ba() x2_ba = x2_sba.to_ba() @@ -118,7 +118,6 @@ def test_sparse_mul(app_inst: ArrayApplication): x2_sp = sparse.COO.from_numpy(x2, fill_value=0) y_sp = x1_sp * x2 y_sba = x2_ba * x1_sba # __rmul__ - assert y_sba.fill_value == y_sp.fill_value assert y_sba.nnz == y_sp.nnz y_ba = y_sba.to_ba() assert np.array_equal(x1 * x2, y_ba.get()) @@ -126,33 +125,34 @@ def test_sparse_mul(app_inst: ArrayApplication): def test_neq(app_inst: ArrayApplication): x1 = np.array([[0, 0, 1, 1], [0, 0, 1, 1], [2, 2, 0, 0], [2, 2, 0, 0]]) - x1_sp = sparse.COO.from_numpy(x1, fill_value=2) + x1_sp = sparse.COO.from_numpy(x1, fill_value=0) x1_ba = app_inst.array(x1, block_shape=(2, 2)) - x1_sba = SparseBlockArray.from_ba(x1_ba, fill_value=2) + x1_sba = SparseBlockArray.from_ba(x1_ba) y_sp = x1_sp > 0 y_sba = x1_sba > 0 - assert y_sba.fill_value == y_sp.fill_value # True assert y_sba.nnz == y_sp.nnz # 8 (nnz changes!) y_ba = y_sba.to_ba() assert np.array_equal(x1 > 0, y_ba.get()) def test_tensordot(app_inst: ArrayApplication): - x1 = np.array([[0, 0, 1, 1], [0, 0, 1, 1], [2, 2, 0, 0], [2, 2, 0, 0]]) + x1 = np.array([[0, 0, 1, 2], [0, 0, 3, 4], [5, 6, 0, 0], [7, 8, 0, 0]]) x2 = np.array([[0, 0, 1, 1], [0, 0, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]]) x1_sp = sparse.COO.from_numpy(x1, fill_value=0) x2_sp = sparse.COO.from_numpy(x2, fill_value=0) x1_ba = app_inst.array(x1, block_shape=(2, 2)) x2_ba = app_inst.array(x2, block_shape=(2, 2)) - x1_sba = SparseBlockArray.from_ba(x1_ba, fill_value=0) - x2_sba = SparseBlockArray.from_ba(x2_ba, fill_value=0) + x1_sba = SparseBlockArray.from_ba(x1_ba) + x2_sba = SparseBlockArray.from_ba(x2_ba) y_sp = sparse.tensordot(x1_sp, x2_sp, axes=1) - y_sba = x1_sba.tensordot(x2_sba, axes=1) - assert y_sba.fill_value == y_sp.fill_value # 0 + y_sba = x1_sba @ x2_sba assert y_sba.nnz == y_sp.nnz # y_ba = y_sba.to_ba() assert np.array_equal(np.tensordot(x1, x2, axes=1), y_ba.get()) + y_ba = x1_ba @ x2_sba # __rmatmul__ + assert np.array_equal(np.tensordot(x1, x2, axes=1), y_ba.get()) + def test_sdtp(app_inst: ArrayApplication): shape = 50, 50, 50 diff --git a/tests/experimental/optimizer/test_copy.py b/tests/experimental/optimizer/test_copy.py index 5de7b42a..ea31aac6 100644 --- a/tests/experimental/optimizer/test_copy.py +++ b/tests/experimental/optimizer/test_copy.py @@ -88,7 +88,7 @@ def tensordot( cluster_state = ClusterState(lhs.km.devices()) lhs_ga: GraphArray = GraphArray.from_ba(lhs, cluster_state, copy_on_op=copy_on_op) rhs_ga: GraphArray = GraphArray.from_ba(rhs, cluster_state, copy_on_op=copy_on_op) - return lhs_ga.tensordot(rhs_ga, axes=axes) + return GraphArray.tensordot(lhs_ga, rhs_ga, axes=axes) def optimized_tensordot( diff --git a/tests/experimental/optimizer/test_fusion.py b/tests/experimental/optimizer/test_fusion.py index 7661fdae..4ced2e78 100644 --- a/tests/experimental/optimizer/test_fusion.py +++ b/tests/experimental/optimizer/test_fusion.py @@ -26,12 +26,14 @@ from nums.core.array.application import BlockArray from nums.core.array.base import BlockArrayBase +from nums.core.array.sparse import SparseBlockArray from nums.experimental.optimizer.clusterstate import 
ClusterState from nums.experimental.optimizer.grapharray import ( GraphArray, ) from nums.experimental.optimizer.tree_search import RandomTS from nums.experimental.optimizer.fusion import FuseGraph +from nums.experimental.optimizer.fusion_utils import print_graph from nums.experimental.optimizer.graph import TreeNode, Leaf, FunctionNode import conftest @@ -45,12 +47,8 @@ def fusion2(app, x, y): return x @ y -def fusion3(app, s, q, p): - return s * (q @ p.T) - - def ga_op( - app, func, x: BlockArray, y: BlockArray, copy_on_op=True, max_args=2 + app, func, x: BlockArrayBase, y: BlockArrayBase, copy_on_op=True, max_args=2 ) -> BlockArray: cluster_state: ClusterState = ClusterState(x.km.devices()) x_ga: GraphArray = GraphArray.from_ba(x, cluster_state, copy_on_op=copy_on_op) @@ -69,27 +67,6 @@ def ga_op( return BlockArray(result_ga.grid, x.km, result_ga.to_blocks()) -def ga_op_sampled_dense_dense( - app, func, s: BlockArray, p: BlockArray, q: BlockArray, copy_on_op=True, max_args=3 -) -> BlockArray: - cluster_state: ClusterState = ClusterState(s.km.devices()) - s_ga: GraphArray = GraphArray.from_ba(s, cluster_state, copy_on_op=copy_on_op) - p_ga: GraphArray = GraphArray.from_ba(p, cluster_state, copy_on_op=copy_on_op) - q_ga: GraphArray = GraphArray.from_ba(q, cluster_state, copy_on_op=copy_on_op) - op_ga: GraphArray = func(app, s_ga, p_ga, q_ga) - start_time = time.time() - fused_ga: GraphArray = op_ga.compile(max_args) - end_time = time.time() - result_ga: GraphArray = RandomTS( - seed=conftest.rs, - max_samples_per_step=1, - max_reduction_pairs=1, - force_final_action=True, - ).solve(fused_ga) - - return BlockArray(result_ga.grid, s.km, result_ga.to_blocks()) - - def test_fusion(app_inst_mock_none): app = app_inst_mock_none x_shape, x_block_shape = (10,), (5,) @@ -140,24 +117,132 @@ def test_tensordot(app_inst_mock_none): assert block.dtype == opt_z.dtype -def test_sparse_array(app_inst_mock_none): +def ga_op_sparse_2( + app, func, x: BlockArrayBase, y: BlockArrayBase, copy_on_op=True, max_args=2 +) -> SparseBlockArray: + cluster_state: ClusterState = ClusterState(x.km.devices()) + x_ga: GraphArray = GraphArray.from_ba(x, cluster_state, copy_on_op=copy_on_op) + y_ga: GraphArray = GraphArray.from_ba(y, cluster_state, copy_on_op=copy_on_op) + op_ga: GraphArray = func(app, x_ga, y_ga) + start_time = time.time() + fused_ga: GraphArray = op_ga.compile(max_args) + end_time = time.time() + result_ga: GraphArray = RandomTS( + seed=conftest.rs, + max_samples_per_step=1, + max_reduction_pairs=1, + force_final_action=True, + ).solve(fused_ga) + + return SparseBlockArray(result_ga.grid, x.km, result_ga.to_blocks()) + + +def spmm(app, p, q): + return p @ q + + +def test_spmm(app_inst_mock_none): app = app_inst_mock_none - q_shape, q_block_shape = (10, 2), (2, 2) - p_shape, p_block_shape = (20, 2), (2, 2) - s_shape, s_block_shape = (10, 20), (2, 2) - real_q = np.random.random(np.product(q_shape)).reshape(q_shape) + p_shape, p_block_shape = (10, 4), (2, 2) + q_shape, q_block_shape = (4, 10), (2, 2) + p: SparseBlockArray = app.random.sparse_normal( + shape=p_shape, block_shape=p_block_shape, p=0.1 + ) + q: SparseBlockArray = app.random.sparse_normal( + shape=q_shape, block_shape=q_block_shape, p=0.1 + ) + real_p = p.to_ba().get() + real_q = q.to_ba().get() + z: SparseBlockArray = spmm(app, p, q) + start_time = time.time() + opt_z: SparseBlockArray = ga_op_sparse_2(app, spmm, p, q) + end_time = time.time() + print(end_time - start_time) + assert z.nnz == opt_z.nnz + assert np.allclose(z.to_ba().get(), 
spmm(np, real_p, real_q)) + assert app.allclose(z.to_ba(), opt_z.to_ba()).get() + + +def ga_op_sparse_3( + app, + func, + s: BlockArrayBase, + p: BlockArrayBase, + q: BlockArrayBase, + copy_on_op=True, + max_args=3, +) -> SparseBlockArray: + cluster_state: ClusterState = ClusterState(s.km.devices()) + s_ga: GraphArray = GraphArray.from_ba(s, cluster_state, copy_on_op=copy_on_op) + p_ga: GraphArray = GraphArray.from_ba(p, cluster_state, copy_on_op=copy_on_op) + q_ga: GraphArray = GraphArray.from_ba(q, cluster_state, copy_on_op=copy_on_op) + op_ga: GraphArray = func(app, s_ga, p_ga, q_ga) + start_time = time.time() + fused_ga: GraphArray = op_ga.compile(max_args) + end_time = time.time() + print(fused_ga.graphs[0, 0]) + result_ga: GraphArray = RandomTS( + seed=conftest.rs, + max_samples_per_step=1, + max_reduction_pairs=1, + force_final_action=True, + ).solve(fused_ga) + + return SparseBlockArray(result_ga.grid, s.km, result_ga.to_blocks()) + + +def sparse_fusion(app, s, p, q): + return (s + s) * p * q + + +def test_sparse_fusion(app_inst_mock_none): + app = app_inst_mock_none + s_shape, s_block_shape = (10, 4), (2, 2) + p_shape, p_block_shape = (10, 4), (2, 2) + q_shape, q_block_shape = (10, 4), (2, 2) real_p = np.random.random(np.product(p_shape)).reshape(p_shape) - real_s = np.random.random(np.product(s_shape)).reshape(s_shape) + real_q = np.random.random(np.product(q_shape)).reshape(q_shape) + s: SparseBlockArray = app.random.sparse_normal( + shape=s_shape, block_shape=s_block_shape, p=0.1 + ) + p: BlockArray = app.array(real_p, p_block_shape) q: BlockArray = app.array(real_q, q_block_shape) + real_s = s.to_ba().get() + z: SparseBlockArray = sparse_fusion(app, s, p, q) + start_time = time.time() + opt_z: SparseBlockArray = ga_op_sparse_3(app, sparse_fusion, s, p, q) + end_time = time.time() + print(end_time - start_time) + assert z.nnz == opt_z.nnz + assert np.allclose(z.to_ba().get(), sparse_fusion(np, real_s, real_p, real_q)) + assert app.allclose(z.to_ba(), opt_z.to_ba()).get() + + +def sddmm(app, s, p, q): + return s * (p @ q.T) + + +def test_sddmm(app_inst_mock_none): + app = app_inst_mock_none + s_shape, s_block_shape = (20, 10), (2, 2) + p_shape, p_block_shape = (20, 4), (2, 2) + q_shape, q_block_shape = (10, 4), (2, 2) + real_p = np.random.random(np.product(p_shape)).reshape(p_shape) + real_q = np.random.random(np.product(q_shape)).reshape(q_shape) + s: SparseBlockArray = app.random.sparse_normal( + shape=s_shape, block_shape=s_block_shape, p=0.1 + ) p: BlockArray = app.array(real_p, p_block_shape) - s: BlockArray = app.array(real_s, s_block_shape) - z: BlockArray = fusion3(app, s, q, p) + q: BlockArray = app.array(real_q, q_block_shape) + real_s = s.to_ba().get() + z: SparseBlockArray = sddmm(app, s, p, q) start_time = time.time() - opt_z: BlockArray = ga_op_sampled_dense_dense(app, fusion3, s, q, p) + opt_z: SparseBlockArray = ga_op_sparse_3(app, sddmm, s, p, q) end_time = time.time() print(end_time - start_time) - assert np.allclose(z.get(), fusion3(np, real_s, real_q, real_p)) - assert app.allclose(z, opt_z).get() + assert z.nnz == opt_z.nnz + assert np.allclose(z.to_ba().get(), sddmm(np, real_s, real_p, real_q)) + assert app.allclose(z.to_ba(), opt_z.to_ba()).get() if __name__ == "__main__": @@ -165,8 +250,8 @@ def test_sparse_array(app_inst_mock_none): app = conftest.mock_ray_cluster((1, 1)) # test_sparse_array(app) - # test_fusion(app) - test_tensordot_variant2(app) + test_fusion(app) + # test_tensordot_variant2(app) conftest.destroy_mock_cluster(app) # app = 
conftest.mock_cluster((10, 1)) diff --git a/tests/experimental/optimizer/test_size.py b/tests/experimental/optimizer/test_meta.py similarity index 71% rename from tests/experimental/optimizer/test_size.py rename to tests/experimental/optimizer/test_meta.py index a5606c6b..2b06ba78 100644 --- a/tests/experimental/optimizer/test_size.py +++ b/tests/experimental/optimizer/test_meta.py @@ -1,98 +1,99 @@ import numpy as np import sparse -from nums.experimental.optimizer.size import TreeNodeSize +from nums.experimental.optimizer.node_meta import LeafMeta def test_nbytes(): x1 = np.eye(10) x1_sp = sparse.COO.from_numpy(x1, fill_value=0) - x1_ts = TreeNodeSize( + x1_ts = LeafMeta( (10, 10), 10, np.int64, - 0, + False, ) assert x1_sp.nbytes == x1_ts.nbytes def test_uop(): - x1_ts = TreeNodeSize( + x1_ts = LeafMeta( (10, 10), 10, np.int64, - 1, + False, ) y_ts = x1_ts.uop("negative") assert np.allclose(y_ts.nnz, x1_ts.nnz) - assert y_ts.fill_value == -1 + assert not y_ts.is_dense def test_add(): - x1_ts = TreeNodeSize( + x1_ts = LeafMeta( (10, 10), 10, np.int64, - 1, + False, ) - x2_ts = TreeNodeSize( + x2_ts = LeafMeta( (10, 10), 20, np.int64, - 2, + False, ) y_ts = x1_ts + x2_ts assert np.allclose(y_ts.nnz, int((1 - 0.9 * 0.8) * 100)) - assert y_ts.fill_value == 3 + assert not y_ts.is_dense def test_mul(): - x1_ts = TreeNodeSize( + x1_ts = LeafMeta( (10, 10), 10, np.int64, - 0, + False, ) - x2_ts = TreeNodeSize( + x2_ts = LeafMeta( (10, 10), 20, np.int64, - 0, + False, ) y_ts = x1_ts * x2_ts assert np.allclose(y_ts.nnz, int(0.1 * 0.2 * 100)) - x2_ts = TreeNodeSize( + x2_ts = LeafMeta( (10, 1), 10, np.int64, - 1, + False, ) y_ts = x1_ts * x2_ts assert np.allclose(y_ts.nnz, int(0.1 * 100)) - x2_ts = TreeNodeSize( + x2_ts = LeafMeta( (10, 1), 10, np.int64, - None, + True, ) y_ts = x1_ts * x2_ts assert np.allclose(y_ts.nnz, int(0.1 * 100)) + assert not y_ts.is_dense def test_tensordot(): - x1_ts = TreeNodeSize( + x1_ts = LeafMeta( (10, 10), 10, np.int64, - 0, + False, ) - x2_ts = TreeNodeSize( + x2_ts = LeafMeta( (10, 10), 20, np.int64, - 0, + False, ) y_ts = x1_ts.tensordot(x2_ts, axes=1) assert np.allclose(y_ts.nnz, int((1 - (1 - 0.1 * 0.2) ** 10) * 100)) diff --git a/tests/experimental/optimizer/test_ops.py b/tests/experimental/optimizer/test_ops.py index 2098a692..156c9651 100644 --- a/tests/experimental/optimizer/test_ops.py +++ b/tests/experimental/optimizer/test_ops.py @@ -200,21 +200,24 @@ def compute_graph_array(app: ArrayApplication, ga) -> BlockArray: app = app_inst_mock_small cluster_state = ClusterState(app.km.devices()) - X = app.random.sparse_normal(shape=(10, 3), block_shape=(5, 3)) + X = app.random.sparse_normal(shape=(10, 6), block_shape=(5, 3), p=0.1) Xc = X - theta: SparseBlockArray = SparseBlockArray.from_ba( - app.zeros((Xc.shape[1],), (Xc.block_shape[1],), dtype=Xc.dtype) - ) + print(Xc.nnz) + theta: BlockArray = app.ones((Xc.shape[1],), (Xc.block_shape[1],), dtype=Xc.dtype) X_ga: GraphArray = GraphArray.from_ba(Xc, cluster_state) theta_ga: GraphArray = GraphArray.from_ba(theta, cluster_state) Z_ga: GraphArray = X_ga @ theta_ga Z_ga: GraphArray = collapse_graph_array(app, Z_ga) + # one_ga: GraphArray = GraphArray.from_ba( + # BlockArray.from_scalar(1, app.km), cluster_state + # ) one_ga: GraphArray = GraphArray.from_ba( - SparseBlockArray.from_scalar(1, app.km), cluster_state + app.ones((Xc.shape[0],), (Xc.block_shape[0],), dtype=Xc.dtype), cluster_state ) mu_ga: GraphArray = collapse_graph_array(app, one_ga / (one_ga + app.exp(-Z_ga))) - mu_ba: SparseBlockArray = 
compute_graph_array(app, mu_ga) - print(mu_ba.todense().get()) + mu_ba: BlockArray = compute_graph_array(app, mu_ga) + print(mu_ba.get()) + assert np.allclose(1 / (1 + np.exp(-(X.to_ba().get() @ theta.get()))), mu_ba.get()) def test_sparse_bop_2(app_inst_mock_small): @@ -256,7 +259,9 @@ def compute_graph_array(app: ArrayApplication, ga) -> BlockArray: P = P_sba.to_ba().get() Q = Q_sba.to_ba().get() S = S_sba.to_ba().get() - assert np.allclose(S - P @ Q.T, X_sba.to_ba().get()) + X = X_sba.to_ba().get() + print(X) + assert np.allclose(S - P @ Q.T, X) if __name__ == "__main__": diff --git a/tests/experimental/optimizer/test_tensordot.py b/tests/experimental/optimizer/test_tensordot.py index 7657b898..c06ae1ae 100644 --- a/tests/experimental/optimizer/test_tensordot.py +++ b/tests/experimental/optimizer/test_tensordot.py @@ -41,7 +41,7 @@ def optimized_tensordot( cluster_state: ClusterState = ClusterState(lhs.km.devices()) lhs_ga: GraphArray = GraphArray.from_ba(lhs, cluster_state, copy_on_op=copy_on_op) rhs_ga: GraphArray = GraphArray.from_ba(rhs, cluster_state, copy_on_op=copy_on_op) - tensordot_ga = lhs_ga.tensordot(rhs_ga, axes=axes) + tensordot_ga = GraphArray.tensordot(lhs_ga, rhs_ga, axes=axes) global random_state print("*" * 50) print("op grid shape", tensordot_ga.grid.grid_shape) @@ -119,7 +119,7 @@ def test_load_sqr(app_inst_mock_big): cluster_state: ClusterState = ClusterState(app.km.devices()) lhs_ga: GraphArray = GraphArray.from_ba(lhs, cluster_state) rhs_ga: GraphArray = GraphArray.from_ba(rhs, cluster_state) - tensordot_ga = lhs_ga.tensordot(rhs_ga, axes=axes) + tensordot_ga = GraphArray.tensordot(lhs_ga, rhs_ga, axes=axes) mem_diff = max(cluster_state.resources[0]) - min(cluster_state.resources[0]) net_in_diff = max(cluster_state.resources[1]) - min(cluster_state.resources[1]) @@ -163,7 +163,7 @@ def test_load_single_block_rhs(app_inst_mock_big): cluster_state: ClusterState = ClusterState(app.km.devices()) lhs_ga: GraphArray = GraphArray.from_ba(lhs, cluster_state) rhs_ga: GraphArray = GraphArray.from_ba(rhs, cluster_state) - tensordot_ga = lhs_ga.tensordot(rhs_ga, axes=axes) + tensordot_ga = GraphArray.tensordot(lhs_ga, rhs_ga, axes=axes) print("memory", cluster_state.resources[0]) print("net_in", cluster_state.resources[1])