Cache output stride parameters in registers to reduce global loads

2025-06-26 18:15:54 +00:00 · 2025-02-23 18:45:40 -08:00
parent ccb208bcac
commit 46bafd9e03
1 changed file with 1 addition and 1 deletion
--- a/csrc/flash_fwd_mla_kernel.h
+++ b/csrc/flash_fwd_mla_kernel.h
@@ -28,7 +28,7 @@ constexpr auto getSmemLayoutK() {
    }
 }
-template<int kHeadDim_, int kBlockM_, int kBlockN_, int kNWarps_, typename elem_type = cutlass::bfloat16_t, int kHeadDimV_ = 0>
+template<int kHeadDim_, int kBlockM_, int kBlockN_, int kNWarps_, typename elem_type=cutlass::bfloat16_t, int kHeadDimV_ = 0>
 struct Flash_fwd_kernel_traits_mla {
    using Element = elem_type;
    using ElementAccum = float;