Optimize performance

2025-05-05 20:04:22 +00:00 · 2025-04-23 14:34:14 +08:00 · 2025-04-23 14:34:14 +08:00 · 6b53c65c04
commit 6b53c65c04
parent 24517316af
1 changed files with 3 additions and 1 deletions
--- a/deep_gemm/include/deep_gemm/scheduler.cuh
+++ b/deep_gemm/include/deep_gemm/scheduler.cuh
@@ -135,7 +135,9 @@ struct Scheduler {
                return false;

            // NOTES: we don't have to set `is_peer_cta_alive` for masked grouped GEMM, as it must be aligned
-            is_peer_cta_alive = (next_block_idx ^ 1) < num_blocks;
+            is_peer_cta_alive = kNumNBlocks % kNumTMAMulticast == 0 or          // Always aligned on N (constant bypass)
+                                num_aligned_m_blocks % kNumTMAMulticast == 0 or // Always aligned on M (constant bypass)
+                                (next_block_idx ^ 1) < num_blocks;              // Peer CTA in bound
            get_swizzled_block_idx(num_aligned_m_blocks, next_block_idx, m_block_idx, n_block_idx);
        }
        return true;