Use one qp per sm for internode normal kernels (#181)

Let the sender SM use channel_id and the receiver SM use channel_id + num_channels.
This commit is contained in:
Zhicheng Wu
2025-06-13 14:37:59 +08:00
committed by GitHub
parent 21efbe9b48
commit 05df5554ff
2 changed files with 3 additions and 3 deletions

View File

@@ -225,7 +225,7 @@ def test_loop(local_rank: int, num_local_ranks: int):
ll_num_tokens, ll_hidden, ll_num_experts, ll_num_topk = 16, 5120, 256, 9
num_sms = 24
- num_qps_per_rank = max(num_sms // 2, ll_num_experts // num_ranks if test_ll_compatibility else 0)
+ num_qps_per_rank = max(num_sms, ll_num_experts // num_ranks if test_ll_compatibility else 0)
buffer = deep_ep.Buffer(group, int(1e9), int(1e9), low_latency_mode=test_ll_compatibility,
num_qps_per_rank=num_qps_per_rank)