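Minor fix: round `num_min_sms` up to a multiple of `best_num_tma_multicast` (instead of 2) and assert TMA multicast legality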

2025-06-26 23:15:49 +00:00 · 2025-03-10 13:02:02 +08:00 · 2025-03-10 13:02:02 +08:00 · bed67b234c
commit bed67b234c
parent ed278eddd3
1 changed file with 2 additions and 2 deletions
--- a/deep_gemm/jit_kernels/gemm.py
+++ b/deep_gemm/jit_kernels/gemm.py
@@ -105,8 +105,8 @@ def get_best_configs(m: int, n: int, k: int, num_groups: int, num_sms: int,
    # NOTES: less L2 cache usage and less GPU frequency drop
    num_waves = get_num_waves(best_block_m, best_block_n)
    num_min_sms = ceil_div(ceil_div(m, best_block_m) * ceil_div(n, best_block_n) * num_groups, num_waves)
-    num_min_sms = ceil_div(max(num_min_sms, num_sms - 8), 2) * 2
-    assert num_min_sms <= num_sms
+    num_min_sms = ceil_div(max(num_min_sms, num_sms - 8), best_num_tma_multicast) * best_num_tma_multicast
+    assert num_min_sms <= num_sms and is_tma_multicast_legal(n, best_block_n, best_num_tma_multicast, num_min_sms)

    return num_min_sms, best_block_m, best_block_n, best_num_stages, best_num_tma_multicast, best_smem_size