diff --git a/csrc/kernels/internode.cu b/csrc/kernels/internode.cu index 4a33f17..a49c430 100644 --- a/csrc/kernels/internode.cu +++ b/csrc/kernels/internode.cu @@ -605,7 +605,7 @@ dispatch(int4* recv_x, float* recv_x_scales, int64_t* recv_topk_idx, float* recv // Read progress auto synced_last_issued_tail = __shfl_sync(0xffffffff, last_issued_tail, dst_rdma_rank); - auto processed_tail = ld_acquire_cta(const_cast(rdma_send_channel_tail + dst_rdma_rank)); + auto processed_tail = __shfl_sync(0xffffffff, ld_acquire_cta(const_cast(rdma_send_channel_tail + dst_rdma_rank)), 0); auto num_tokens_processed = processed_tail - synced_last_issued_tail; if (num_tokens_processed != synced_num_tokens_to_send and num_tokens_processed < num_max_rdma_chunked_send_tokens) continue;