Optimize expression

2025-06-26 23:15:49 +00:00 · 2025-04-14 10:26:33 +08:00 · 2025-04-14 10:26:33 +08:00 · 6366f5ad1a
commit 6366f5ad1a
parent 9406e2a3a1
1 changed files with 3 additions and 1 deletions
--- a/deep_gemm/include/deep_gemm/fp8_gemm.cuh
+++ b/deep_gemm/include/deep_gemm/fp8_gemm.cuh
@ -386,7 +386,9 @@ fp8_gemm_kernel(__nv_bfloat16* gmem_d, float* scales_b, int* grouped_layout,
                        // Reshape the atom in another view and swizzle
                        //  - original: `(WGMMA_M_PER_WARP, kSwizzleDMode / kNumBankGroupBytes)`
                        //  - new: `(WGMMA_M_PER_WARP * kSwizzleDMode / kNumBankGroupBytes / 8, 8)`
-                        auto row = bank_group_index / 8, col = bank_group_index % 8;
+                        constexpr bool kHasShortcut = (kSwizzleDMode / kNumBankGroupBytes) == 8;
+                        auto row = kHasShortcut ? (in_atom_offset / 8 + lane_idx) : (bank_group_index / 8);
+                        auto col = kHasShortcut ? (in_atom_offset) : (bank_group_index % 8);
                        col ^= row % (kSwizzleDMode / 16);

                        // Add back into the base pointer