From ed3444bf9ba5da3a01150cb3546d343b6d6de36e Mon Sep 17 00:00:00 2001 From: Shangyan Zhou Date: Thu, 26 Jun 2025 17:58:49 +0800 Subject: [PATCH] Fix transaction window. (#260) --- csrc/kernels/internode.cu | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/csrc/kernels/internode.cu b/csrc/kernels/internode.cu index 47f62ac..b83e9ae 100644 --- a/csrc/kernels/internode.cu +++ b/csrc/kernels/internode.cu @@ -560,19 +560,21 @@ dispatch(int4* recv_x, float* recv_x_scales, int64_t* recv_topk_idx, float* recv if (is_token_in_rank_uint64 != 0) { // Acquire lock first acquire_lock(rdma_send_channel_lock + lane_id); - - // Release the transaction slot auto window = rdma_send_channel_window[lane_id]; auto latest_tail = rdma_send_channel_tail[lane_id]; auto offset = rdma_tail_idx - latest_tail; - // The same effect with `EP_DEVICE_ASSERT(offset < 32);` - EP_STATIC_ASSERT(kNumDispatchRDMASenderWarps < 32, "Invalid warps"); - + while (offset >= 32) { + release_lock(rdma_send_channel_lock + lane_id); + acquire_lock(rdma_send_channel_lock + lane_id); + latest_tail = rdma_send_channel_tail[lane_id]; + offset = rdma_tail_idx - latest_tail; + } + // Release the transaction slot // Erase bit and move the ones if possible window ^= 1u << offset; if (offset == 0) { - auto num_empty_slots = __ffs(~window) - 1; + auto num_empty_slots = (~window) == 0 ? 32 : __ffs(~window) - 1; st_release_cta(rdma_send_channel_tail + lane_id, latest_tail + num_empty_slots); window >>= num_empty_slots; }