Fully remove barrier FIFO designs (#200)

* Fully remove FIFO slots

* Fully remove FIFO buffers

* Minor style fixes

* Fix some typos

* Bugs fixed

* Cleanup `ibgda_poll_cq`
This commit is contained in:
Chenggang Zhao
2025-06-10 16:23:20 +08:00
committed by GitHub
parent a16af40531
commit 8da2d7b38d
10 changed files with 121 additions and 181 deletions

View File

@@ -396,44 +396,32 @@ __forceinline__ __device__ int get_lane_id() {
return lane_id;
}
// Advances `head` in place by `kNumRanks`, wrapping the result modulo
// `NUM_MAX_FIFO_SLOTS`.
template <int kNumRanks>
__forceinline__ __device__ void move_fifo_slots(int &head) {
    const int advanced = head + kNumRanks;
    head = advanced % NUM_MAX_FIFO_SLOTS;
}
// Warp-wide poll over the first `kNumRanks` entries of `task`.
// Returns true to every calling lane if, for at least one lane index below
// `kNumRanks`, the value read through `ld_volatile_global(task + lane)` differs
// from `expected`.
// The vote uses the full 0xffffffff mask, so all 32 lanes must call this.
template <int kNumRanks>
__device__ __forceinline__ bool not_finished(int *task, int expected) {
    const auto lane = threadIdx.x % 32;
    // Short-circuit: lanes at or above kNumRanks perform no load and vote false.
    const bool mismatch = lane < kNumRanks and ld_volatile_global(task + lane) != expected;
    return __any_sync(0xffffffff, mismatch);
}
// NOTE(review): this span seems to interleave two revisions of one helper: the
// `timeout_check` signature with its `not_finished` polling loop, and the
// `barrier_block` signature with its `__all_sync` polling loop. Confirm which
// lines belong to the live file before building.
//
// `barrier_block` lines: every thread with `thread_id < kNumRanks` signals, then
// the caller spins until the first `kNumRanks` ints of
// `barrier_signal_ptrs[rank]` are all <= 0, trapping after NUM_TIMEOUT_CYCLES.
template <int kNumRanks>
__forceinline__ __device__ void
timeout_check(int **task_fifo_ptrs, int head, int rank, int expected, int tag = 0) {
barrier_block(int** barrier_signal_ptrs, int rank) {
auto thread_id = static_cast<int>(threadIdx.x);
// Add self-ranks, sub other ranks:
// add FINISHED_SUM_TAG to slot `thread_id` of this rank's array, then subtract
// it from slot `rank` of the array indexed by `thread_id`.
if (thread_id < kNumRanks) {
atomicAdd_system(barrier_signal_ptrs[rank] + thread_id, FINISHED_SUM_TAG);
memory_fence();
atomicSub_system(barrier_signal_ptrs[thread_id] + rank, FINISHED_SUM_TAG);
}
// Only threads below kNumRanks signal, so the block needs at least that many.
EP_DEVICE_ASSERT(kNumRanks <= blockDim.x);
// Check timeout
auto start_time = clock64();
while (not_finished<kNumRanks>(task_fifo_ptrs[rank] + head, expected)) {
if (clock64() - start_time > NUM_TIMEOUT_CYCLES and threadIdx.x == 0) {
printf("DeepEP timeout check failed: %d (rank = %d)\n", tag, rank);
while (true) {
// Threads at or above kNumRanks contribute 0, so they never hold the loop open.
auto value = thread_id < kNumRanks ? ld_volatile_global(barrier_signal_ptrs[rank] + thread_id) : 0;
// Full-mask vote: leave once every lane of the warp sees a non-positive value.
if (__all_sync(0xffffffff, value <= 0))
break;
// One report per warp: only the lane for which get_lane_id() == 0 prints and traps.
if (clock64() - start_time > NUM_TIMEOUT_CYCLES and get_lane_id() == 0) {
// Fixed: the message previously ended with an unbalanced ')'.
printf("DeepEP timeout check failed: rank = %d, thread = %d\n", rank, thread_id);
trap();
}
}
}
// Block-level entry point for a cross-rank barrier on the slots at offset `head`.
//   task_fifo_ptrs: one int array per rank, indexed by rank id.
//   head:           offset of the slot group used for this barrier.
//   rank:           index of the calling rank in `task_fifo_ptrs`.
//   tag:            forwarded to `timeout_check` (defaults to 0).
// Threads below `kNumRanks` signal; then the whole block polls via
// `timeout_check` with an expected value of 0 and meets at `__syncthreads()`.
template <int kNumRanks>
__forceinline__ __device__ void
barrier_device(int **task_fifo_ptrs, int head, int rank, int tag = 0) {
    const auto tid = static_cast<int>(threadIdx.x);
    EP_DEVICE_ASSERT(kNumRanks <= 32);

    if (tid < kNumRanks) {
        // Add the tag to slot `tid` of this rank's array.
        int *local_slot = task_fifo_ptrs[rank] + head + tid;
        atomicAdd_system(local_slot, FINISHED_SUM_TAG);
        memory_fence();
        // Subtract it from slot `rank` of the array indexed by `tid`;
        // that pointer is loaded only after the fence, as before.
        int *peer_slot = task_fifo_ptrs[tid] + head + rank;
        atomicSub_system(peer_slot, FINISHED_SUM_TAG);
    }

    timeout_check<kNumRanks>(task_fifo_ptrs, head, rank, 0, tag);
    __syncthreads();
}
} // namespace deep_ep