From 77f97f79bda051d991aa9681c04138007faa0366 Mon Sep 17 00:00:00 2001 From: Shangyan Zhou Date: Wed, 18 Jun 2025 09:23:25 +0800 Subject: [PATCH] Fix the tail loading issue. (#219) * Fix the tail loading issue. * Modify the sync offset. --- csrc/kernels/internode.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/kernels/internode.cu b/csrc/kernels/internode.cu index 4a33f17..a49c430 100644 --- a/csrc/kernels/internode.cu +++ b/csrc/kernels/internode.cu @@ -605,7 +605,7 @@ dispatch(int4* recv_x, float* recv_x_scales, int64_t* recv_topk_idx, float* recv // Read progress auto synced_last_issued_tail = __shfl_sync(0xffffffff, last_issued_tail, dst_rdma_rank); - auto processed_tail = ld_acquire_cta(const_cast<const int*>(rdma_send_channel_tail + dst_rdma_rank)); + auto processed_tail = __shfl_sync(0xffffffff, ld_acquire_cta(const_cast<const int*>(rdma_send_channel_tail + dst_rdma_rank)), 0); auto num_tokens_processed = processed_tail - synced_last_issued_tail; if (num_tokens_processed != synced_num_tokens_to_send and num_tokens_processed < num_max_rdma_chunked_send_tokens) continue;