mirror of
https://github.com/deepseek-ai/DeepEP
synced 2025-05-05 20:44:48 +00:00
Bugs fixed
This commit is contained in:
parent
592296cd45
commit
680e424bdc
@ -282,6 +282,7 @@ For two micro-batch overlapping, you can refer to the following figure. With our
|
||||
|
||||
## Roadmap
|
||||
|
||||
- [ ] AR support (releasing soon)
|
||||
- [ ] A100 support (intranode only)
|
||||
- [ ] Support BF16 for the low-latency dispatch kernel
|
||||
- [ ] Support NVLink protocol for intranode low-latency kernels
|
||||
|
@ -383,8 +383,7 @@ notify_dispatch(const int* num_tokens_per_rank, int* moe_recv_counter_mapped, in
|
||||
|
||||
// Calculate prefix sum
|
||||
__syncthreads();
|
||||
EP_STATIC_ASSERT(kNumRDMARanks <= 32, "Invalid number of RDMA ranks");
|
||||
if (thread_id < kNumRDMARanks) {
|
||||
if (thread_id == 0) {
|
||||
auto prefix_row = rdma_channel_prefix_matrix + dst_rdma_rank * num_channels;
|
||||
#pragma unroll
|
||||
for (int i = 1; i < num_channels; ++ i)
|
||||
|
Loading…
Reference in New Issue
Block a user