From 15f3897667c7da36f5b71bb5eb2867a4ed215214 Mon Sep 17 00:00:00 2001 From: Shengyu Liu Date: Tue, 22 Apr 2025 16:46:48 +0800 Subject: [PATCH] Update comment --- csrc/kernels/splitkv_mla.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/csrc/kernels/splitkv_mla.cu b/csrc/kernels/splitkv_mla.cu index 950e594..0333605 100644 --- a/csrc/kernels/splitkv_mla.cu +++ b/csrc/kernels/splitkv_mla.cu @@ -938,6 +938,7 @@ __forceinline__ __device__ void wg1_subroutine( // We put the `cute::warpgroup_wait<0>()` out of the `if` statement above, otherwise // nvcc cannot correctly analyse the loop, and will think that we are using accumulator // registers during the WGMMA pipeline, which results in `WARPGROUP.ARRIVE` and `WARPGROUP.DEPBAR.LE` being inserted in SASS and WGMMA instructions being serialized. + // This is also the reason why we put QK^T here, instead of making it the first operation in the loop. cute::warpgroup_wait<0>(); }