mirror of
https://github.com/deepseek-ai/FlashMLA
synced 2025-06-26 18:15:54 +00:00
Update comment
This commit is contained in:
parent
9352b7a790
commit
15f3897667
@ -938,6 +938,7 @@ __forceinline__ __device__ void wg1_subroutine(
|
||||
// We put the `cute::warpgroup_wait<0>()` outside of the `if` statement above; otherwise
|
||||
// nvcc cannot correctly analyse the loop, and will think that we are using accumulator
|
||||
// registers during the WGMMA pipeline, which results in `WARPGROUP.ARRIVE` and `WARPGROUP.DEPBAR.LE` being inserted in SASS and WGMMA instructions being serialized.
|
||||
// This is also the reason why we put QK^T here, instead of making it the first operation in the loop.
|
||||
cute::warpgroup_wait<0>();
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user