Optimize intranode combine. (#247)

* Increase the number of test rounds.
* Add warp synchronization.
* Shuffle the send warps.
* Add the elapsed time to the bench result.
2025-06-26 18:28:11 +00:00 · 2025-06-24 09:10:23 +08:00
parent fbcf430006
commit 9eb2f84b3e
3 changed files with 8 additions and 8 deletions
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -80,7 +80,7 @@ def create_grouped_scores(scores: torch.Tensor, group_idx: torch.Tensor, num_gro
    return (scores * mask).view(num_tokens, num_experts)


-def bench(fn, num_warmups: int = 20, num_tests: int = 30, post_fn=None):
+def bench(fn, num_warmups: int = 50, num_tests: int = 50, post_fn=None):
    # Flush L2 cache with 256 MB data
    torch.cuda.synchronize()
    cache = torch.empty(int(256e6 // 4), dtype=torch.int, device='cuda')