mirror of
https://github.com/deepseek-ai/DeepEP
synced 2025-06-26 18:28:11 +00:00
Fully remove barrier FIFO designs (#200)
* Fully remove FIFO slots
* Fully remove FIFO buffers
* Minor fix styles
* Fix some typos
* Bugs fixed
* Cleanup `ibgda_poll_cq`
This commit is contained in:
@@ -396,44 +396,32 @@ __forceinline__ __device__ int get_lane_id() {
|
||||
return lane_id;
|
||||
}
|
||||
|
||||
// Step the FIFO head index forward by `kNumRanks` slots, in place.
// The result is reduced modulo `NUM_MAX_FIFO_SLOTS`.
template <int kNumRanks>
__forceinline__ __device__ void move_fifo_slots(int &head) {
    const int stepped = head + kNumRanks;
    head = stepped % NUM_MAX_FIFO_SLOTS;
}
|
||||
|
||||
// Warp-wide check over `kNumRanks` consecutive ints starting at `task`.
//
// Threads whose `threadIdx.x % 32` is below `kNumRanks` read `task[lane]` through
// `ld_volatile_global` and compare it against `expected`; the remaining lanes vote `false`.
// Returns true (to every participating lane) if at least one compared value differs.
//
// The vote uses the full mask 0xffffffff, so all 32 lanes of the warp must call this together.
template <int kNumRanks>
__device__ __forceinline__ bool not_finished(int *task, int expected) {
    const auto lane = threadIdx.x % 32;
    // The load is only issued by lanes that own a slot; the ternary keeps it guarded
    const bool mismatch = lane < kNumRanks ? ld_volatile_global(task + lane) != expected : false;
    return __any_sync(0xffffffff, mismatch);
}
|
||||
|
||||
template <int kNumRanks>
|
||||
__forceinline__ __device__ void
|
||||
timeout_check(int **task_fifo_ptrs, int head, int rank, int expected, int tag = 0) {
|
||||
barrier_block(int** barrier_signal_ptrs, int rank) {
|
||||
auto thread_id = static_cast<int>(threadIdx.x);
|
||||
|
||||
// Add self-ranks, sub other ranks
|
||||
if (thread_id < kNumRanks) {
|
||||
atomicAdd_system(barrier_signal_ptrs[rank] + thread_id, FINISHED_SUM_TAG);
|
||||
memory_fence();
|
||||
atomicSub_system(barrier_signal_ptrs[thread_id] + rank, FINISHED_SUM_TAG);
|
||||
}
|
||||
EP_DEVICE_ASSERT(kNumRanks <= blockDim.x);
|
||||
|
||||
// Check timeout
|
||||
auto start_time = clock64();
|
||||
while (not_finished<kNumRanks>(task_fifo_ptrs[rank] + head, expected)) {
|
||||
if (clock64() - start_time > NUM_TIMEOUT_CYCLES and threadIdx.x == 0) {
|
||||
printf("DeepEP timeout check failed: %d (rank = %d)\n", tag, rank);
|
||||
while (true) {
|
||||
auto value = thread_id < kNumRanks ? ld_volatile_global(barrier_signal_ptrs[rank] + thread_id) : 0;
|
||||
if (__all_sync(0xffffffff, value <= 0))
|
||||
break;
|
||||
|
||||
if (clock64() - start_time > NUM_TIMEOUT_CYCLES and get_lane_id() == 0) {
|
||||
printf("DeepEP timeout check failed: rank = %d, thread = %d)\n", rank, thread_id);
|
||||
trap();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Barrier built on the per-rank task FIFO slots at offset `head`.
//
// Each of the first `kNumRanks` threads of the block:
//   1. adds FINISHED_SUM_TAG to slot `head + thread` of this rank's FIFO,
//   2. issues `memory_fence()`,
//   3. subtracts FINISHED_SUM_TAG from slot `head + rank` of the FIFO owned by rank `thread`.
// Afterwards every thread calls `timeout_check` with an expected value of 0 and the given `tag`,
// and the block is joined with `__syncthreads()`.
//
// `task_fifo_ptrs` is indexed by rank; `tag` is only forwarded to `timeout_check`.
template <int kNumRanks>
__forceinline__ __device__ void
barrier_device(int **task_fifo_ptrs, int head, int rank, int tag = 0) {
    const auto peer = static_cast<int>(threadIdx.x);
    EP_DEVICE_ASSERT(kNumRanks <= 32);

    const bool owns_slot = peer < kNumRanks;
    if (owns_slot) {
        // Signal on the local FIFO first, fence, then cancel out on the peer's FIFO
        atomicAdd_system(task_fifo_ptrs[rank] + head + peer, FINISHED_SUM_TAG);
        memory_fence();
        atomicSub_system(task_fifo_ptrs[peer] + head + rank, FINISHED_SUM_TAG);
    }

    // Expected slot value is 0
    timeout_check<kNumRanks>(task_fifo_ptrs, head, rank, 0, tag);
    __syncthreads();
}
|
||||
|
||||
} // namespace deep_ep
|
||||
|
||||
Reference in New Issue
Block a user