mirror of
https://github.com/deepseek-ai/DeepGEMM
synced 2025-05-05 23:34:22 +00:00
Optimize performance
This commit is contained in:
parent
24517316af
commit
6b53c65c04
@@ -135,7 +135,9 @@ struct Scheduler {
|
|||||||
return false;
|
return false;
|
||||||
|
|
||||||
// NOTES: we don't have to set `is_peer_cta_alive` for masked grouped GEMM, as it must be aligned
|
// NOTES: we don't have to set `is_peer_cta_alive` for masked grouped GEMM, as it must be aligned
|
||||||
is_peer_cta_alive = (next_block_idx ^ 1) < num_blocks;
|
is_peer_cta_alive = kNumNBlocks % kNumTMAMulticast == 0 or // Always aligned on N (constant bypass)
|
||||||
|
num_aligned_m_blocks % kNumTMAMulticast == 0 or // Always aligned on M (constant bypass)
|
||||||
|
(next_block_idx ^ 1) < num_blocks; // Peer CTA in bound
|
||||||
get_swizzled_block_idx(num_aligned_m_blocks, next_block_idx, m_block_idx, n_block_idx);
|
get_swizzled_block_idx(num_aligned_m_blocks, next_block_idx, m_block_idx, n_block_idx);
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
|
Loading…
Reference in New Issue
Block a user