From c0da8eaba564b639bd65fde8cd55b5bbecdeb3ef Mon Sep 17 00:00:00 2001 From: Chenggang Zhao Date: Mon, 23 Jun 2025 15:41:49 +0800 Subject: [PATCH] Add sender timeout checks --- csrc/kernels/internode.cu | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/csrc/kernels/internode.cu b/csrc/kernels/internode.cu index 93866bd..6a59c8c 100644 --- a/csrc/kernels/internode.cu +++ b/csrc/kernels/internode.cu @@ -475,7 +475,7 @@ dispatch(int4* recv_x, float* recv_x_scales, int64_t* recv_topk_idx, float* recv // Acquire a tail int rdma_tail_idx = -1; if (is_token_in_rank_uint64 != 0) { - do { + while (true) { // Acquire lock first acquire_lock(rdma_send_channel_lock + lane_id); @@ -495,11 +495,21 @@ dispatch(int4* recv_x, float* recv_x_scales, int64_t* recv_topk_idx, float* recv // Release lock release_lock(rdma_send_channel_lock + lane_id); - } while (rdma_tail_idx == -1); + break; + } // Wait the remote buffer to be released - while (rdma_tail_idx - cached_rdma_channel_head >= num_max_rdma_chunked_recv_tokens) + auto start_time = clock64(); + while (rdma_tail_idx - cached_rdma_channel_head >= num_max_rdma_chunked_recv_tokens) { cached_rdma_channel_head = static_cast<int>(ld_volatile_global(rdma_channel_head.buffer(lane_id))); + + // Timeout check + if (clock64() - start_time >= NUM_TIMEOUT_CYCLES) { + printf("DeepEP dispatch RDMA sender timeout, channel: %d, RDMA: %d, nvl: %d, dst RDMA lane: %d, head: %d, tail: %d\n", + channel_id, rdma_rank, nvl_rank, lane_id, cached_rdma_channel_head, rdma_tail_idx); + trap(); + } + } } __syncwarp();