48 commits
38249ec
Adding distributed batch normalization layers
szaman19 Sep 12, 2025
b24ef7d
Wrap around implementation with torch nn module
szaman19 Sep 12, 2025
9cdadb6
Add DGraph implementation of RGat
szaman19 Sep 12, 2025
d44bc83
Adding PyG sparse tensor wrapper if needed
szaman19 Sep 12, 2025
b539e79
Add RGat implementation
szaman19 Sep 12, 2025
63109a3
Completed with RGAT implementation
szaman19 Sep 15, 2025
335a65e
Add the synthetic dataset for testing purposes
szaman19 Sep 15, 2025
2d9ff3f
Add MAG240M dataset
szaman19 Sep 15, 2025
c2530ad
Bug fixes to get things running correctly
szaman19 Sep 15, 2025
88c0410
Update fix for data type issues and cache generator, but experiencing…
szaman19 Sep 16, 2025
d44fc6e
paper2paper layer running.
szaman19 Sep 16, 2025
e3f091e
Updating the synthetic dataset to track down hang on directed relatio…
szaman19 Sep 17, 2025
1b45b75
Still debugging error on miscalculated gradient size
szaman19 Sep 17, 2025
f71f109
Fix for incorrect tensor shape
szaman19 Sep 17, 2025
076a692
Author 2 paper relation working. Only author 2 institution error rema…
szaman19 Sep 26, 2025
2d27416
Remove extra breakpoints in cache generators
szaman19 Sep 26, 2025
d127220
Fix cache generator with correct input shape for destination gather
szaman19 Sep 26, 2025
a6962cc
Fix on batch norm to have correct local variance reduction
szaman19 Sep 26, 2025
f85f133
Added additional parameters to batch norm for backprop
szaman19 Sep 27, 2025
750d8e6
Adding helper functions to sync normalization values and fixed evalua…
szaman19 Sep 28, 2025
657111e
Latest changes to RGAT
szaman19 Oct 3, 2025
746693d
(OGB-LSC) Bugfix for generating _dest_scatter_cache
KIwabuchi Oct 18, 2025
296fa2f
(OGB-LSC) Workaround for DDP's unused parameter error
KIwabuchi Oct 18, 2025
fa585a3
(OGB-LSC) Some performance optimizations
KIwabuchi Oct 18, 2025
483bc48
Fix DGraph Mag240M dataset __getitem__ method
szaman19 Oct 29, 2025
7930024
Remove debug messages
KIwabuchi Oct 24, 2025
09735a7
(OGB-LSC) Bugfix for mag240m dataset
KIwabuchi Oct 25, 2025
ee1f725
(OGB-LSC) Remove debug message
KIwabuchi Nov 8, 2025
f334ad4
(OGB-LSC) Split dataset using OGB's function
KIwabuchi Nov 11, 2025
c663626
Updated torch bindings implementation for local-scatter-gather
szaman19 Nov 20, 2025
47f6ef3
New and improved concise dataplan with efficient connectivity data st…
szaman19 Nov 20, 2025
e4c9b2e
Add updated kernels for local scatter-gather + NCCLCommPlan
szaman19 Dec 12, 2025
8535a40
Update the scatter-gather impl to allow set and add aggregation
szaman19 Dec 12, 2025
3280ee8
Add ScatterSumGather python wrapper
szaman19 Dec 13, 2025
13c1205
Fix backward function call on StaticGather
szaman19 Dec 13, 2025
ef2efbb
Fixed Scatter forward
szaman19 Dec 13, 2025
22ed522
Updated scatter function impl
szaman19 Dec 17, 2025
dc0ded2
Fix build issues and change op struct
szaman19 Dec 17, 2025
f559bcf
Remove unnecessary imports and remove cache implementation
szaman19 Dec 17, 2025
f2c3915
Decompose internal function to reduce memory usage
szaman19 Dec 17, 2025
bf8c6ac
Optimized CommPlan generator
szaman19 Dec 17, 2025
934d532
Fix node size check
szaman19 Dec 17, 2025
25c3edf
Add edge-conditioned graph plan to hold full edge communication info
szaman19 Dec 18, 2025
16dd5ad
Update GAT implementation with new comm plan
szaman19 Dec 18, 2025
ceb6d90
Remove pyg_wrapper function
szaman19 Dec 18, 2025
673f8b6
Enable hetero-graphs in comm-plan
szaman19 Dec 18, 2025
88269a1
Fixed mismatched API on scatter_sum_gather
szaman19 Dec 18, 2025
0b17d4a
Update python bindings for localScatterSumGather
szaman19 Dec 18, 2025
24 changes: 24 additions & 0 deletions .gitignore
@@ -167,3 +167,27 @@ cython_debug/
# Use wildcards as well
*~
*.o
# Miscellaneous files generated by DGraph data processing
skbuild/
.vscode/
logs/
torchrun_*
*.png
rdvz
*.pt
*.core
*.graph
*.out
*.gz
data_processed
*.zip
cache
graph_cache
*.nsys-rep
*.nsys
*.pth
*.pyc
*.npy
*.npz
*.sqlite
*.csv
4 changes: 2 additions & 2 deletions DGraph/distributed/Engine.py
@@ -50,7 +50,7 @@ def scatter(
output_size: int,
rank_mappings: Optional[torch.Tensor] = None,
*args,
**kwargs
**kwargs,
) -> torch.Tensor:
raise NotImplementedError

@@ -60,7 +60,7 @@ def gather(
indices: Union[torch.Tensor, torch.LongTensor],
rank_mappings: Optional[torch.Tensor] = None,
*args,
**kwargs
**kwargs,
) -> torch.Tensor:
raise NotImplementedError

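For illustration, a minimal single-process engine matching these two abstract signatures could look like the sketch below (a hypothetical `LocalEngine`; the real Engine subclasses implement cross-rank communication, and the leading parameters of `scatter` are assumed from the visible hunk):

import torch

class LocalEngine:
    """Hypothetical single-process stand-in for an Engine subclass."""

    def scatter(self, src, indices, output_size, rank_mappings=None, *args, **kwargs):
        # Write rows of src into a fresh output tensor at the given row indices.
        output = torch.zeros(output_size, src.shape[-1], dtype=src.dtype, device=src.device)
        output[indices] = src
        return output

    def gather(self, src, indices, rank_mappings=None, *args, **kwargs):
        # Read rows of src at the given row indices.
        return src[indices]
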
99 changes: 97 additions & 2 deletions DGraph/distributed/RankLocalOps.py
@@ -16,9 +16,15 @@
"""

import torch
import warnings  # used by the fallback paths below
import torch.distributed as dist

try:
from DGraph.torch_local import local_masked_gather, local_masked_scatter
from DGraph.torch_local import (
local_masked_gather,
local_masked_scatter,
local_masked_scatter_gather,
local_masked_scatter_add_gather,
)

_LOCAL_OPT_KERNELS_AVAILABLE = True
except ImportError:
@@ -81,6 +87,93 @@ def OptimizedRankLocalMaskedGather(
return output


def OptimizedLocalScatterGather(
src: torch.Tensor,
src_indices: torch.Tensor,
dst_indices: torch.Tensor,
output: torch.Tensor,
):
"""
Performs the operation

for i in range(len(src_indices)):
output[dst_indices[i]] = src[src_indices[i]]
Args:
src (torch.Tensor): Source tensor
src_indices (torch.Tensor): Source indices
dst_indices (torch.Tensor): Destination indices
output (torch.Tensor): Output tensor
Returns:
torch.Tensor: Output tensor after scatter-gather
"""

if not _LOCAL_OPT_KERNELS_AVAILABLE:
warnings.warn(
"Optimized local kernels are not available. Falling back to the default implementation."
)
output[:, dst_indices] = src[:, src_indices]
else:
bs = src.shape[0]
num_src_rows = src.shape[1]
num_features = src.shape[-1]
num_output_rows = output.shape[1]
local_masked_scatter_gather(
src,
src_indices.cuda(),
dst_indices.cuda(),
output,
bs,
num_src_rows,
num_features,
num_output_rows,
)
return output
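
A small usage sketch, assuming the shapes implied by the kernel path above (src of shape (batch, num_src_rows, num_features) and a preallocated output with num_output_rows rows; the index values are illustrative):

import torch
from DGraph.distributed.RankLocalOps import OptimizedLocalScatterGather

src = torch.randn(1, 4, 8, device="cuda")     # (batch, num_src_rows, num_features)
output = torch.zeros(1, 6, 8, device="cuda")  # (batch, num_output_rows, num_features)
src_indices = torch.tensor([0, 2, 3])         # rows to read from src
dst_indices = torch.tensor([5, 1, 0])         # rows to write in output

out = OptimizedLocalScatterGather(src, src_indices, dst_indices, output)
# out[:, 5] == src[:, 0], out[:, 1] == src[:, 2], out[:, 0] == src[:, 3]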


def OptimizedLocalScatterSumGather(
src: torch.Tensor,
src_indices: torch.Tensor,
dst_indices: torch.Tensor,
output: torch.Tensor,
):
"""
Performs the operation

for i in range(len(src_indices)):
output[dst_indices[i]] += src[src_indices[i]]
Args:
src (torch.Tensor): Source tensor
src_indices (torch.Tensor): Source indices
dst_indices (torch.Tensor): Destination indices
output (torch.Tensor): Output tensor
Returns:
torch.Tensor: Output tensor after scatter-gather
"""

if not _LOCAL_OPT_KERNELS_AVAILABLE:
warnings.warn(
"Optimized local kernels are not available. Falling back to the default implementation."
)
for i in range(src_indices.shape[0]):
output[:, dst_indices[i], :] += src[:, src_indices[i], :]
else:
bs = src.shape[0]
num_src_rows = src.shape[1]
num_features = src.shape[-1]
num_output_rows = output.shape[1]
local_masked_scatter_add_gather(
src,
src_indices.cuda(),
dst_indices.cuda(),
output,
bs,
num_src_rows,
num_features,
num_output_rows,
)
return output
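
Unlike the set variant, duplicate destination indices accumulate here, which is the primitive behind neighbor aggregation. A sketch with illustrative values:

import torch
from DGraph.distributed.RankLocalOps import OptimizedLocalScatterSumGather

src = torch.ones(1, 3, 4, device="cuda")
output = torch.zeros(1, 2, 4, device="cuda")
src_indices = torch.tensor([0, 1, 2])
dst_indices = torch.tensor([0, 0, 1])  # source rows 0 and 1 both land on output row 0

out = OptimizedLocalScatterSumGather(src, src_indices, dst_indices, output)
# out[:, 0] is all 2.0 (two source rows summed); out[:, 1] is all 1.0

Note that the CUDA path accumulates with atomicAdd, so the order of floating-point additions over duplicate destinations is not deterministic and results can differ in the last bits across runs.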


def OutOfPlaceRankLocalMaskedGather(
_src: torch.Tensor, indices: torch.Tensor, rank_mapping: torch.Tensor, rank: int
) -> torch.Tensor:
@@ -140,7 +233,9 @@ def RankLocalRenumberingWithMapping(_indices, rank_mapping):
unique_indices, inverse_indices = torch.unique(_indices, return_inverse=True)
rank_mapping = rank_mapping.to(_indices.device)
renumbered_indices = inverse_indices
unique_rank_mapping = torch.zeros_like(unique_indices, dtype=rank_mapping.dtype, device=rank_mapping.device)
unique_rank_mapping = torch.zeros_like(
unique_indices, dtype=rank_mapping.dtype, device=rank_mapping.device
)
unique_rank_mapping.scatter_(0, inverse_indices, rank_mapping)

return renumbered_indices, unique_indices, unique_rank_mapping
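
A worked example of the renumbering (values chosen for illustration):

import torch

_indices = torch.tensor([7, 3, 7, 9])
rank_mapping = torch.tensor([1, 0, 1, 2])  # owning rank of each index

unique_indices, inverse_indices = torch.unique(_indices, return_inverse=True)
# unique_indices  -> [3, 7, 9]
# inverse_indices -> [1, 0, 1, 2]  (each index renumbered into [0, num_unique))

unique_rank_mapping = torch.zeros_like(unique_indices)
unique_rank_mapping.scatter_(0, inverse_indices, rank_mapping)
# unique_rank_mapping -> [0, 1, 2]  (one rank per unique index; duplicates
# write the same value, so the result is consistent)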
140 changes: 140 additions & 0 deletions DGraph/distributed/csrc/local_data_kernels.cuh
@@ -251,4 +251,144 @@ namespace Local
}
}
}



template <typename T>
struct FloatAtomicAddOp
{
__device__ __forceinline__ void operator()(T *cur_addr, const T new_val)
{
atomicAdd(cur_addr, new_val);
}
};

template <typename T>
struct FloatSetOp
{
__device__ __forceinline__ void operator()(T *cur_addr, const T new_val)
{
*cur_addr = new_val;
}
};


/**
* Masked scatter-gather kernel that performs the operation:
*
*     Y[mask[i]] = Op(Y[mask[i]], X[indices[i]])
*
* where Y is the output matrix, X is the input matrix, indices holds the
* source rows, and mask holds the destination rows.
*/

template <typename Op>
__global__ void Masked_Scatter_Gather_Kernel(
const float *__restrict__ values,
const long *__restrict__ indices,
const long *__restrict__ mask,
float *__restrict__ output,
const int mini_batch_size,
const int num_indices,
const int num_cols,
const int num_output_rows)
{
const size_t gidx = threadIdx.x + blockIdx.x * blockDim.x;
const size_t gidy = threadIdx.y + blockIdx.y * blockDim.y;
const size_t gidz = threadIdx.z + blockIdx.z * blockDim.z;

const size_t nthreadsx = gridDim.x * blockDim.x;
const size_t nthreadsy = gridDim.y * blockDim.y;
const size_t nthreadsz = gridDim.z * blockDim.z;

Op op;

for (size_t mb_i = gidz; mb_i < mini_batch_size; mb_i += nthreadsz)
{
const auto values_offset = mb_i * num_cols * num_indices;
const auto output_offset = mb_i * num_cols * num_output_rows;
const auto ind_offset = mb_i * num_indices;
const auto mask_offset = mb_i * num_indices;

for (size_t row = gidy; row < num_indices; row += nthreadsy)
{
const auto output_row = mask[mask_offset + row];
const auto input_row = indices[ind_offset + row];

for (size_t col = gidx; col < num_cols; col += nthreadsx)
{
auto *output_addr = &output[output_offset + output_row * num_cols + col];
const auto input_val = values[values_offset + input_row * num_cols + col];
op(output_addr, input_val);
}
}
}
}
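
For reference, the kernel's semantics in plain PyTorch (a sketch useful for unit-testing the CUDA path; `op` stands in for the Op template parameter, and `indices`/`mask` are laid out as (mini_batch, n), matching the per-batch offsets in the kernel):

import torch

def masked_scatter_gather_reference(values, indices, mask, output, op):
    # values/output: (mini_batch, rows, cols); indices/mask: (mini_batch, n).
    for mb in range(values.shape[0]):
        for i in range(indices.shape[1]):
            dst, src = mask[mb, i], indices[mb, i]
            output[mb, dst] = op(output[mb, dst], values[mb, src])
    return output

# op mirrors the Op template parameter: FloatSetOp or FloatAtomicAddOp.
set_op = lambda cur, new: new
add_op = lambda cur, new: cur + new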

/*
* Optimized masked scatter-gather kernel that performs the operation:
*
*     Y[mask[i]] = Op(Y[mask[i]], X[indices[i]])
*
* This kernel vectorizes loads and stores with float4 and therefore
* requires num_cols to be a multiple of 4.
*
* where Y is the output matrix, X is the input matrix, indices holds the
* source rows, and mask holds the destination rows.
*/
template <typename Op>
__global__ void Optimized_Masked_Scatter_Gather_Kernel(
const float *__restrict__ values,
const long *__restrict__ indices,
const long *__restrict__ mask,
float *__restrict__ output,
const int mini_batch_size,
const int num_indices,
const int num_cols,
const int num_output_rows)
{
const size_t gidx = threadIdx.x + blockIdx.x * blockDim.x;
const size_t gidy = threadIdx.y + blockIdx.y * blockDim.y;
const size_t gidz = threadIdx.z + blockIdx.z * blockDim.z;

const size_t nthreadsx = gridDim.x * blockDim.x;
const size_t nthreadsy = gridDim.y * blockDim.y;
const size_t nthreadsz = gridDim.z * blockDim.z;

// Grid-stride loop over mini-batches

Op binary_operator;
for (size_t mb_i = gidz; mb_i < mini_batch_size; mb_i += nthreadsz)
{
const auto values_offset = mb_i * num_cols / 4 * num_indices;
const auto output_offset = mb_i * num_cols / 4 * num_output_rows;
const auto ind_offset = mb_i * num_indices;
const auto mask_offset = mb_i * num_indices;

// Grid-stride loop over rows
for (size_t row = gidy; row < num_indices; row += nthreadsy)
{
// Load the destination and source rows for this index entry.
const long output_row = mask[mask_offset + row];
const long input_row = indices[ind_offset + row];

for (size_t col = gidx; col < num_cols / 4; col += nthreadsx)
{
const float4 values_vec = reinterpret_cast<const float4 *>(values)[values_offset + input_row * num_cols / 4 + col];
float4* output_addr = &reinterpret_cast<float4 *>(output)[output_offset + output_row * num_cols / 4 + col];
binary_operator(output_addr, values_vec);
}
}
}
}
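
Because this kernel loads float4 vectors, it is only safe when the feature width is a multiple of 4 and the tensors are contiguous, so that every row starts on a 16-byte boundary. Whether the extension dispatches on this is not shown in the diff; the check below is an assumption about what a caller should verify before choosing the optimized kernel:

import torch

def can_use_vectorized_kernel(src: torch.Tensor, output: torch.Tensor) -> bool:
    # float4 loads need num_cols % 4 == 0 and contiguous storage so that
    # each row begins 16-byte aligned.
    return (
        src.shape[-1] % 4 == 0
        and src.is_contiguous()
        and output.is_contiguous()
    )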

} // namespace Local
2 changes: 2 additions & 0 deletions DGraph/distributed/csrc/torch_local_bindings.cpp
@@ -21,4 +21,6 @@ PYBIND11_MODULE(torch_local, m)
{
m.def("local_masked_gather", &local_masked_gather, "Masked Gather");
m.def("local_masked_scatter", &local_masked_scatter, "Masked Scatter");
m.def("local_masked_scatter_gather", &local_masked_scatter_gather, "Masked Scatter Gather");
m.def("local_masked_scatter_add_gather", &local_masked_scatter_add_gather, "Masked Scatter Add Gather");
}
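
Once the extension is built, the new bindings are callable directly with the explicit shape arguments used by the RankLocalOps wrappers above; a minimal sketch (CUDA tensors assumed, shapes chosen for illustration):

import torch
from DGraph.torch_local import local_masked_scatter_add_gather

bs, num_src_rows, num_features, num_output_rows = 1, 4, 8, 6
src = torch.randn(bs, num_src_rows, num_features, device="cuda")
output = torch.zeros(bs, num_output_rows, num_features, device="cuda")
src_indices = torch.tensor([0, 1, 2], device="cuda")
dst_indices = torch.tensor([5, 5, 0], device="cuda")  # duplicates accumulate

local_masked_scatter_add_gather(
    src, src_indices, dst_indices, output,
    bs, num_src_rows, num_features, num_output_rows,
)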