diff --git a/csrc/kernels/intranode.cu b/csrc/kernels/intranode.cu index 0f3cb7e..23e02ba 100644 --- a/csrc/kernels/intranode.cu +++ b/csrc/kernels/intranode.cu @@ -41,6 +41,7 @@ notify_dispatch(const int* num_tokens_per_rank, int* moe_recv_counter_mapped, for (int i = 0; i < num_experts_per_rank; ++ i) per_expert_buffer[rank * num_experts_per_rank + i] = num_tokens_per_expert[thread_id * num_experts_per_rank + i]; } + memory_fence(); __syncthreads(); // Wait for all ranks to be finished diff --git a/csrc/kernels/utils.cuh b/csrc/kernels/utils.cuh index ac97896..9b24f04 100644 --- a/csrc/kernels/utils.cuh +++ b/csrc/kernels/utils.cuh @@ -446,7 +446,6 @@ barrier_block(int** barrier_signal_ptrs, int rank) { // Add self-ranks, sub other ranks if (thread_id < kNumRanks) { atomicAdd_system(barrier_signal_ptrs[rank] + thread_id, FINISHED_SUM_TAG); - memory_fence(); atomicSub_system(barrier_signal_ptrs[thread_id] + rank, FINISHED_SUM_TAG); } EP_DEVICE_ASSERT(kNumRanks <= blockDim.x);