Optimize performance

This commit is contained in:
Chenggang Zhao 2025-04-23 14:34:14 +08:00
parent 24517316af
commit 6b53c65c04

View File

@@ -135,7 +135,9 @@ struct Scheduler {
return false;
// NOTES: we don't have to set `is_peer_cta_alive` for masked grouped GEMM, as it must be aligned
is_peer_cta_alive = (next_block_idx ^ 1) < num_blocks;
is_peer_cta_alive = kNumNBlocks % kNumTMAMulticast == 0 or // Always aligned on N (constant bypass)
num_aligned_m_blocks % kNumTMAMulticast == 0 or // Always aligned on M (constant bypass)
(next_block_idx ^ 1) < num_blocks; // Peer CTA in bound
get_swizzled_block_idx(num_aligned_m_blocks, next_block_idx, m_block_idx, n_block_idx);
}
return true;