Remove the low-latency usage flag (#214)

2025-06-26 18:28:11 +00:00 · 2025-06-16 13:30:14 +08:00
parent 1b92be8a71
commit 8aaddf76ae
6 changed files with 15 additions and 69 deletions
--- a/deep_ep/buffer.py
+++ b/deep_ep/buffer.py
@@ -457,19 +457,6 @@ class Buffer:
            async_finish, allocate_on_comm_stream)
        return combined_x, combined_topk_weights, EventOverlap(event)

-    def get_low_latency_usage_flag(self):
-        """
-        Return a host-side integer flag, which indicates the stages of low-latency kernels.
-        The initial value is 0, the low-latency dispatch will add 1 before communication, the low-latency combine
-            will add 1 after communication.
-        This is useful when there is no two-batch overlap, and you want to overlap H2D/D2H transfer with attention layers.
-
-        Returns:
-            flag: the host-side integer flag pointer. The value is in `int`, but returns a `uint64_t` pointer. Please
-                `reinterpret_cast` the returned value into `int*`.
-        """
-        return self.runtime.get_low_latency_usage_flag()
-
    def clean_low_latency_buffer(self, num_max_dispatch_tokens_per_rank: int, hidden: int, num_experts: int) -> None:
        """
        As low-latency kernels require part of the buffer to be zero-initialized, so it is vital to clean the buffer