Support zero-copy for low-latency combine

Chenggang Zhao
2025-03-18 15:41:50 +08:00
parent 82dcf48fd3
commit dcaf73e5ff
7 changed files with 80 additions and 28 deletions


@@ -488,7 +488,7 @@ class Buffer:
             self.runtime.low_latency_dispatch(x, topk_idx,
                                               num_max_dispatch_tokens_per_rank, num_experts,
                                               use_fp8, async_finish, return_recv_hook)
-        handle = (packed_recv_src_info, packed_recv_layout_range, num_max_dispatch_tokens_per_rank, num_experts)
+        handle = (packed_recv_src_info, packed_recv_layout_range, num_max_dispatch_tokens_per_rank, x.size(1), num_experts)
         tensors_to_record = (x, topk_idx,
                              packed_recv_x, packed_recv_x_scales, packed_recv_count,
                              packed_recv_src_info, packed_recv_layout_range)
@@ -497,8 +497,8 @@ class Buffer:
 
     # noinspection PyTypeChecker
     def low_latency_combine(self, x: torch.Tensor, topk_idx: torch.Tensor, topk_weights: torch.Tensor,
-                            handle: tuple, async_finish: bool = False, return_recv_hook: bool = False,
-                            out: Optional[torch.Tensor] = None) -> \
+                            handle: tuple, zero_copy: bool = False, async_finish: bool = False,
+                            return_recv_hook: bool = False, out: Optional[torch.Tensor] = None) -> \
             Tuple[torch.Tensor, EventOverlap, Callable]:
         """
         A low-latency implementation for combining tokens (reduce **with weights**) with IBGDA.
@@ -517,6 +517,8 @@ class Buffer:
             topk_weights: `[num_combined_tokens, num_topk]` with `torch.float`, the expert weights selected by the dispatched
                 tokens. The received tokens will be reduced with the weights in this tensor.
             handle: the communication handle given by the `dispatch` function.
+            zero_copy: whether `x` is already copied into the RDMA buffer; must be used together with
+                `get_next_low_latency_combine_buffer`.
             async_finish: the current stream will not wait for the communication kernels to be finished if set.
             return_recv_hook: return a receiving hook if set. If set, the kernel will only issue the RDMA requests,
                 but **without actually receiving the data**; you must call the returned hook to ensure the data has arrived.
@@ -528,9 +530,24 @@ class Buffer:
             event: the event after executing the kernel (valid only if `async_finish` is set).
             hook: the receiving hook function (valid only if `return_recv_hook` is set).
         """
-        src_info, layout_range, num_max_dispatch_tokens_per_rank, num_experts = handle
+        src_info, layout_range, num_max_dispatch_tokens_per_rank, hidden, num_experts = handle
         combined_x, event, hook = self.runtime.low_latency_combine(x, topk_idx, topk_weights, src_info, layout_range,
                                                                    num_max_dispatch_tokens_per_rank, num_experts,
-                                                                   async_finish, return_recv_hook, out)
+                                                                   zero_copy, async_finish, return_recv_hook, out)
         tensors_to_record = (x, topk_idx, topk_weights, src_info, layout_range, combined_x)
         return combined_x, EventOverlap(event, tensors_to_record if async_finish else None), hook
+
+    def get_next_low_latency_combine_buffer(self, handle: object):
+        """
+        Get the raw registered RDMA buffer tensor for the next low-latency combine, so that the combine kernel can skip the copy.
+
+        Arguments:
+            handle: the communication handle given by the `dispatch` function.
+
+        Returns:
+            buffer: the raw RDMA low-latency buffer as a BF16 PyTorch tensor with shape
+                `[num_local_experts, num_ranks * num_max_dispatch_tokens_per_rank, hidden]`; you should fill this
+                buffer yourself before calling `low_latency_combine` with `zero_copy=True`.
+        """
+        src_info, layout_range, num_max_dispatch_tokens_per_rank, hidden, num_experts = handle
+        return self.runtime.get_next_low_latency_combine_buffer(num_max_dispatch_tokens_per_rank, hidden, num_experts)
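
For context, a rough usage sketch of the new zero-copy path (not part of this commit; the `buffer` instance name and the `expert_out` tensor are illustrative assumptions). The caller fills the tensor returned by `get_next_low_latency_combine_buffer` with the expert outputs and then calls `low_latency_combine` with `zero_copy=True`, so the kernel can skip copying `x` into the RDMA buffer:

    # Hedged sketch: `buffer` is a deep_ep.Buffer, `handle` comes from a prior
    # `low_latency_dispatch`, and `expert_out` (hypothetical name) holds the BF16 expert
    # outputs laid out as [num_local_experts, num_ranks * num_max_dispatch_tokens_per_rank, hidden].

    # Write the expert outputs straight into the registered RDMA buffer.
    buffer.get_next_low_latency_combine_buffer(handle)[:, :, :] = expert_out

    # Combine with zero_copy=True; the kernel reads from the RDMA buffer it already owns
    # instead of copying `expert_out` in again.
    combined_x, event, hook = buffer.low_latency_combine(expert_out, topk_idx, topk_weights, handle,
                                                         zero_copy=True)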