From f1d7a7c89f4d774c43a728fd4ef6fac6c6429fc4 Mon Sep 17 00:00:00 2001
From: Shangyan Zhou
Date: Tue, 24 Jun 2025 17:51:28 +0800
Subject: [PATCH] Remove memory fence in NVLink barrier.

---
 csrc/kernels/intranode.cu | 1 +
 csrc/kernels/utils.cuh    | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/kernels/intranode.cu b/csrc/kernels/intranode.cu
index 0f3cb7e..23e02ba 100644
--- a/csrc/kernels/intranode.cu
+++ b/csrc/kernels/intranode.cu
@@ -41,6 +41,7 @@ notify_dispatch(const int* num_tokens_per_rank, int* moe_recv_counter_mapped,
             for (int i = 0; i < num_experts_per_rank; ++ i)
                 per_expert_buffer[rank * num_experts_per_rank + i] = num_tokens_per_expert[thread_id * num_experts_per_rank + i];
         }
+        memory_fence();
         __syncthreads();
 
         // Wait for all ranks to be finished
diff --git a/csrc/kernels/utils.cuh b/csrc/kernels/utils.cuh
index ac97896..9b24f04 100644
--- a/csrc/kernels/utils.cuh
+++ b/csrc/kernels/utils.cuh
@@ -446,7 +446,6 @@ barrier_block(int** barrier_signal_ptrs, int rank) {
     // Add self-ranks, sub other ranks
     if (thread_id < kNumRanks) {
         atomicAdd_system(barrier_signal_ptrs[rank] + thread_id, FINISHED_SUM_TAG);
-        memory_fence();
         atomicSub_system(barrier_signal_ptrs[thread_id] + rank, FINISHED_SUM_TAG);
     }
     EP_DEVICE_ASSERT(kNumRanks <= blockDim.x);