mirror of
https://github.com/deepseek-ai/DeepEP
synced 2025-06-26 18:28:11 +00:00
Add automatic warp count control for low-latency kernels (#213)
* Add automatic warp count control for low-latency dispatch
* Add automatic warp count control for low-latency combine
* More assertions
This commit is contained in:
@@ -148,7 +148,8 @@ void dispatch(void* packed_recv_x, void* packed_recv_x_scales,
|
||||
int num_topk, int num_experts, int rank, int num_ranks,
|
||||
bool use_fp8, bool round_scale, bool use_ue8m0,
|
||||
void* workspace, int* usage_flag,
|
||||
- cudaStream_t stream, int phases);
|
||||
+ int num_device_sms, cudaStream_t stream,
|
||||
+ int phases);
|
||||
|
||||
void combine(void* combined_x,
|
||||
void* rdma_recv_x, int* rdma_recv_flag, void* rdma_send_x,
|
||||
@@ -158,7 +159,8 @@ void combine(void* combined_x,
|
||||
int num_combined_tokens, int hidden, int num_max_dispatch_tokens_per_rank,
|
||||
int num_topk, int num_experts, int rank, int num_ranks,
|
||||
void* workspace, int* usage_flag,
|
||||
- cudaStream_t stream, int phases, bool zero_copy);
|
||||
+ int num_device_sms, cudaStream_t stream,
|
||||
+ int phases, bool zero_copy);
|
||||
|
||||
} // namespace internode_ll
|
||||
|
||||
|
||||
Reference in New Issue
Block a user