Weight gradient kernels for dense and MoE models (#95)

* Init weight gradient kernels
* Support unaligned N, K and gmem stride
* Update docs
* Several cleanups
* Remove restrictions on N
* Add stride(0) assertions

---------

Co-authored-by: Chenggang Zhao <chenggangz@deepseek.com>
2025-06-26 23:15:49 +00:00 · 2025-05-14 14:47:58 +08:00
parent d75b218b7b
commit 04278f6dee
12 changed files with 911 additions and 72 deletions
--- a/deep_gemm/jit_kernels/runtime.py
+++ b/deep_gemm/jit_kernels/runtime.py
@@ -87,45 +87,48 @@ def make_2d_tma_copy_desc(global_address: torch.Tensor,


 def make_2d_tma_desc(global_address: torch.Tensor, layout: Layout,
-                     gmem_rows: int, gmem_cols: int,
+                     gmem_rows: int, gmem_cols: int, gmem_stride: int,
                     smem_rows: int, smem_cols: int,
                     swizzle_type: cbd.CUtensorMapSwizzle = cbd.CUtensorMapSwizzle.CU_TENSOR_MAP_SWIZZLE_128B) -> cbd.CUtensorMap:
    if layout == Layout.RowMajor:
        gmem_dim = (cbd.cuuint64_t(gmem_cols), cbd.cuuint64_t(gmem_rows))
        smem_dim = (cbd.cuuint32_t(smem_cols), cbd.cuuint32_t(smem_rows))
-        return make_2d_tma_copy_desc(global_address, gmem_dim, cbd.cuuint64_t(gmem_cols * global_address.element_size()), smem_dim, swizzle_type)
+        return make_2d_tma_copy_desc(global_address, gmem_dim, cbd.cuuint64_t(gmem_stride * global_address.element_size()), smem_dim, swizzle_type)
    else:
        gmem_dim = (cbd.cuuint64_t(gmem_rows), cbd.cuuint64_t(gmem_cols))
        smem_dim = (cbd.cuuint32_t(smem_rows), cbd.cuuint32_t(smem_cols))
-        return make_2d_tma_copy_desc(global_address, gmem_dim, cbd.cuuint64_t(gmem_rows * global_address.element_size()), smem_dim, swizzle_type)
+        return make_2d_tma_copy_desc(global_address, gmem_dim, cbd.cuuint64_t(gmem_stride * global_address.element_size()), smem_dim, swizzle_type)


 def make_2d_tma_a_desc(gemm_type: GemmType, global_address: torch.Tensor,
                       shape_m: int, shape_k: int,
                       block_m: int, block_k: int,
-                       num_groups: int) -> cbd.CUtensorMap:
+                       num_groups: int, a_stride: int = 0) -> cbd.CUtensorMap:
+    a_stride = shape_k if a_stride == 0 else a_stride
    return make_2d_tma_desc(global_address, Layout.RowMajor,
-                            shape_m * (num_groups if gemm_type == GemmType.GroupedMasked else 1), shape_k,
+                            shape_m * (num_groups if gemm_type == GemmType.GroupedMasked else 1), shape_k, a_stride,
                            block_m, block_k)


 def make_2d_tma_b_desc(gemm_type: GemmType, global_address: torch.Tensor,
                       shape_k: int, shape_n: int,
                       block_k: int, block_n: int,
-                       num_groups: int) -> cbd.CUtensorMap:
+                       num_groups: int, b_stride: int = 0) -> cbd.CUtensorMap:
+    b_stride = shape_k if b_stride == 0 else b_stride
    return make_2d_tma_desc(global_address, Layout.ColMajor,
-                            shape_k, shape_n * (num_groups if gemm_type != GemmType.Normal else 1),
+                            shape_k, shape_n * (num_groups if gemm_type != GemmType.Normal else 1), b_stride,
                            block_k, block_n)


 def make_2d_tma_d_desc(gemm_type: GemmType, global_address: torch.Tensor,
                       shape_m: int, shape_n: int,
                       block_m: int, block_n: int,
-                       num_groups: int, swizzle_mode: int) -> cbd.CUtensorMap:
+                       num_groups: int, swizzle_mode: int, d_stride: int = 0) -> cbd.CUtensorMap:
    # Swizzling requires the inner box dim to be less or equal than `kSwizzleDMode`
    # bytes, so `BLOCK_N * sizeof(T) / kSwizzleDMode` TMA stores are required
+    d_stride = shape_n if d_stride == 0 else d_stride
    return make_2d_tma_desc(global_address, Layout.RowMajor,
-                            shape_m * (num_groups if gemm_type == GemmType.GroupedMasked else 1), shape_n,
+                            shape_m * (num_groups if gemm_type == GemmType.GroupedMasked else 1), shape_n, d_stride,
                            block_m, block_n if swizzle_mode == 0 else swizzle_mode // global_address.element_size(),
                            swizzle_type_map[swizzle_mode])

@@ -136,10 +139,20 @@ def make_2d_tma_scales_a_desc(gemm_type: GemmType, global_address: torch.Tensor,
    shape_m = (shape_m + tma_alignment - 1) // tma_alignment * tma_alignment

    return make_2d_tma_desc(global_address, Layout.ColMajor,
-                            shape_m, (shape_k + block_k - 1) // block_k * (num_groups if gemm_type == GemmType.GroupedMasked else 1),
+                            shape_m, (shape_k + block_k - 1) // block_k * (num_groups if gemm_type == GemmType.GroupedMasked else 1), shape_m,
                            block_m, 1, cbd.CUtensorMapSwizzle.CU_TENSOR_MAP_SWIZZLE_NONE)


+def make_2d_tma_scales_b_desc(gemm_type: GemmType, global_address: torch.Tensor, shape_n: int, shape_k: int, block_n: int, block_k: int, num_groups: int = 1) -> cbd.CUtensorMap:
+    # Round `shape_n` up to a whole number of 16-byte units; `//` keeps the padded extent (reused below as the stride) an `int`, not a float
+    tma_alignment = 16 // global_address.element_size()
+    shape_n = (shape_n + tma_alignment - 1) // tma_alignment * tma_alignment
+    # Column-major, unswizzled descriptor: padded `shape_n` rows, one column per `block_k` chunk of K (times `num_groups` for `GroupedMasked`), box of `block_n` x 1
+    return make_2d_tma_desc(global_address, Layout.ColMajor,
+                            shape_n, (shape_k + block_k - 1) // block_k * (num_groups if gemm_type == GemmType.GroupedMasked else 1), shape_n,
+                            block_n, 1, cbd.CUtensorMapSwizzle.CU_TENSOR_MAP_SWIZZLE_NONE)
+
+
 class FP8GemmRuntime(Runtime):
    def __init__(self, path: str) -> None:
        super().__init__(path, [
@@ -254,3 +267,111 @@ static void __instantiate_kernel() {{
            None,
        )
        return cbd.cuLaunchKernelEx(config, kernel, (arg_values, arg_types), 0)
+
+
+class FP8WGradGemmRuntime(Runtime):
+    def __init__(self, path: str) -> None:
+        super().__init__(path, [
+            'NUM_TMA_MULTICAST',
+            'K',
+            'BLOCK_M',
+            'GMEM_D',
+            'NUM_SMS',
+            'SMEM_SIZE',
+            'TENSOR_MAP_A',
+            'TENSOR_MAP_B',
+            'TENSOR_MAP_SCALES_A',
+            'TENSOR_MAP_SCALES_B',
+            'TENSOR_MAP_D',
+            'STREAM',
+        ])
+
+    @staticmethod
+    def generate(**kwargs) -> str:
+        code = f'''
+#ifdef __CUDACC_RTC__
+#include <deep_gemm/nvrtc_std.cuh>
+#else
+#include <cuda.h>
+#include <string>
+#endif
+
+#include <cuda_bf16.h>
+#include <cuda_fp8.h>
+
+#include <deep_gemm/fp8_wgrad_gemm.cuh>
+
+using namespace deep_gemm;
+
+static void __instantiate_kernel() {{
+    auto ptr = reinterpret_cast<void*>(&fp8_wgrad_gemm_kernel<
+        {kwargs['M']},
+        {kwargs['N']},
+        {kwargs['BLOCK_M']},
+        {kwargs['BLOCK_N']},
+        {kwargs['BLOCK_K']},
+        {kwargs['NUM_STAGES']},
+        {kwargs['LAST_STAGES']},
+        {kwargs['NUM_TMA_THREADS']},
+        {kwargs['NUM_MATH_THREADS_PER_GROUP']},
+        {kwargs['NUM_TMA_MULTICAST']},
+        {'true' if kwargs['IS_TMA_MULTICAST_ON_A'] else 'false'}
+      >);
+}};
+'''
+        if int(os.getenv('DG_JIT_DEBUG', 0)):
+            print(f'Generated FP8 WGrad GEMM code:\n{code}')
+        return code
+
+    # noinspection PyMethodOverriding
+    @staticmethod
+    def launch(kernel: cbd.CUkernel, num_tma_multicast: int, shape_k: int,
+               block_m: int, gmem_d: torch.Tensor, num_sms: int, smem_size: int,
+               tensor_map_a: cbd.CUtensorMap, tensor_map_b: cbd.CUtensorMap,
+               tensor_map_scales_a: cbd.CUtensorMap, tensor_map_scales_b: cbd.CUtensorMap,
+               tensor_map_d: cbd.CUtensorMap,
+               stream: cbd.CUstream) -> cbd.CUresult:
+        num_tma_threads = 128
+        num_math_threads_per_group = 128
+
+        res = cbd.cuKernelSetAttribute(cbd.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, smem_size, kernel, cbd.CUdevice(gmem_d.device.index))[0]
+        if res != cbd.CUresult.CUDA_SUCCESS:
+            raise Exception(f'Failed to set max dynamic shared memory size: {res}')
+
+        attr_val = cbd.CUlaunchAttributeValue()
+        attr_val.clusterDim.x = num_tma_multicast
+        attr_val.clusterDim.y = 1
+        attr_val.clusterDim.z = 1
+        attr = cbd.CUlaunchAttribute()
+        attr.id = cbd.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
+        attr.value = attr_val
+
+        config = cbd.CUlaunchConfig()
+        config.numAttrs = 1
+        config.attrs = [attr]
+        config.gridDimX = num_sms
+        config.gridDimY = 1
+        config.gridDimZ = 1
+        config.blockDimX = get_num_threads_per_sm(num_tma_threads, num_math_threads_per_group, block_m)
+        config.blockDimY = 1
+        config.blockDimZ = 1
+        config.sharedMemBytes = smem_size
+        config.hStream = stream
+
+        arg_values = (
+            shape_k,
+            tensor_map_a,
+            tensor_map_b,
+            tensor_map_scales_a,
+            tensor_map_scales_b,
+            tensor_map_d,
+        )
+        arg_types = (
+            ctypes.c_uint32,
+            None,
+            None,
+            None,
+            None,
+            None,
+        )
+        return cbd.cuLaunchKernelEx(config, kernel, (arg_values, arg_types), 0)