From 46bafd9e033137fc5b24470c0b82f47fe62bc3f6 Mon Sep 17 00:00:00 2001 From: Gareth Jones Date: Sun, 23 Feb 2025 18:45:40 -0800 Subject: [PATCH] Cache output stride parameters in registers to reduce global loads --- csrc/flash_fwd_mla_kernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/flash_fwd_mla_kernel.h b/csrc/flash_fwd_mla_kernel.h index 9265a1a..b78247d 100644 --- a/csrc/flash_fwd_mla_kernel.h +++ b/csrc/flash_fwd_mla_kernel.h @@ -28,7 +28,7 @@ constexpr auto getSmemLayoutK() { } } -template +template struct Flash_fwd_kernel_traits_mla { using Element = elem_type; using ElementAccum = float;