Init weight gradient kernels.

2025-06-26 23:15:49 +00:00 · 2025-05-06 17:16:27 +08:00
parent d374456787
commit d5470d3b4e
9 changed files with 841 additions and 35 deletions
--- a/deep_gemm/jit_kernels/init.py
+++ b/deep_gemm/jit_kernels/init.py
@@ -3,6 +3,10 @@ from .m_grouped_gemm import (
    m_grouped_gemm_fp8_fp8_bf16_nt_contiguous,
    m_grouped_gemm_fp8_fp8_bf16_nt_masked
 )
+from .wgrad_gemm import (
+    wgrad_gemm_fp8_fp8_fp32_nt,
+    k_grouped_wgrad_gemm_fp8_fp8_fp32_nt
+)
 from .utils import (
    ceil_div, set_num_sms, get_num_sms,
    get_col_major_tma_aligned_tensor,
--- a/deep_gemm/jit_kernels/gemm.py
+++ b/deep_gemm/jit_kernels/gemm.py
@@ -61,16 +61,20 @@ def get_block_n_padding_for_smem_d(block_n: int) -> int:
    return (((padding + requirement[1]) if padding < 0 else padding) * 4) // elem_size


-def get_smem_config(num_stages: int, k: int, block_m: int, block_n: int, block_k: int = 128) -> Tuple[int, int, int]:
+def get_smem_config(num_stages: int, k: int, block_m: int, block_n: int, block_k: int = 128,        
+                    is_fp32_out: bool = False, is_wgrad: bool = False) -> Tuple[int, int, int]:
    # Try swizzle first, as it does not waste shared memory
    swizzle_mode = get_swizzle_mode(block_n)
    block_n_padding = get_block_n_padding_for_smem_d(block_n) if swizzle_mode == 0 else 0

-    smem_d = block_m * (block_n + block_n_padding) * 2
+    smem_d = block_m * (block_n + block_n_padding) * (4 if is_fp32_out else 2)
    smem_a_per_stage = block_m * block_k
    smem_scales_a_per_stage = block_m * 4
    smem_b_per_stage = block_n * block_k
-    smem_scales_b = ceil_div(k, block_k) * 4
+    if is_wgrad:
+        smem_scales_b_per_stage = ceil_div(block_n * 4, 128) * 128
+    else:
+        smem_scales_b = ceil_div(k, block_k) * 4
    smem_barrier = num_stages * 8 * 2

    smem_size = 0
@@ -78,7 +82,10 @@ def get_smem_config(num_stages: int, k: int, block_m: int, block_n: int, block_k
    smem_size += num_stages * smem_a_per_stage
    smem_size += num_stages * smem_scales_a_per_stage
    smem_size += num_stages * smem_b_per_stage
-    smem_size += ceil_div(smem_scales_b * (1 if block_k % block_n == 0 else 2), 8) * 8
+    if is_wgrad:
+        smem_size += num_stages * smem_scales_b_per_stage
+    else:
+        smem_size += ceil_div(smem_scales_b * (1 if block_k % block_n == 0 else 2), 8) * 8
    smem_size += smem_barrier

    # Swizzle and padding are not compatible
@@ -89,13 +96,18 @@ def get_smem_config(num_stages: int, k: int, block_m: int, block_n: int, block_k

@lru_cache(maxsize=None)
 def get_best_configs(m: int, n: int, k: int, num_groups: int, num_sms: int,
-                     is_grouped_contiguous: bool = False, is_grouped_masked: bool = False) -> \
+                     is_grouped_contiguous: bool = False, is_grouped_masked: bool = False,
+                     is_fp32_out: bool = False, is_wgrad: bool = False) -> \
        Tuple[int, int, int, int, Tuple[int, bool], Tuple[int, int, int]]:
    if not is_grouped_contiguous:
-        block_ms = (64, 128, 256)
+        block_ms = (64, 128, ) + ((256, ) if not is_fp32_out else ())
    else:
        block_ms = (get_m_alignment_for_contiguous_layout(), )
-    block_ns = tuple(range(16, 129, 8)) + (144, 160, )
+    block_ns = tuple(range(16, 129, 8)) + ((136, 152, ) if is_wgrad else (144, 160, ))
+    
+    # Avoid bank conflicts for fp32 output
+    if is_fp32_out:
+        block_ns = [x for x in block_ns if x % 16 == 8]

    fix_wave_saturate = lambda x: num_sms if x == 0 else x
    get_num_waves = lambda bm, bn: (ceil_div(ceil_div(m, bm) * ceil_div(n, bn) * num_groups, num_sms) if bm else None)
@@ -135,7 +147,7 @@ def get_best_configs(m: int, n: int, k: int, num_groups: int, num_sms: int,
        # Unrolling both stages and `num_former_iters` will cause large code size
        stage_candidates = (4, 3)
    for num_stages in stage_candidates:
-        best_smem_config = get_smem_config(num_stages, k, best_block_m, best_block_n)
+        best_smem_config = get_smem_config(num_stages, k, best_block_m, best_block_n, is_fp32_out=is_fp32_out, is_wgrad=is_wgrad)
        if best_smem_config[0] <= sm90_capacity:
            best_num_stages = num_stages
            break
--- a/deep_gemm/jit_kernels/wgrad_gemm.py
+++ b/deep_gemm/jit_kernels/wgrad_gemm.py
@@ -0,0 +1,171 @@
+import math
+import torch
+from typing import List, Tuple
+
+from .gemm import get_best_configs
+from .tuner import jit_tuner
+from .utils import get_num_sms, get_col_major_tma_aligned_tensor, get_tma_aligned_size
+
+# C++ include list and launch-code template passed to `jit_tuner.compile_and_tune`; the `{NAME}` fields mirror its `keys`
+includes = ('"deep_gemm/fp8_wgrad_gemm.cuh"', )
+template = """
+using namespace deep_gemm;
+
+// Templated args from Python JIT call
+constexpr auto M = {M}, N = {N};
+constexpr auto BLOCK_M = {BLOCK_M};
+constexpr auto BLOCK_N = {BLOCK_N};
+constexpr auto BLOCK_K = 128;
+constexpr auto kNumStages = {NUM_STAGES};
+constexpr auto kLastStages = {LAST_STAGES};
+constexpr auto kNumTMAMulticast = {NUM_TMA_MULTICAST};
+constexpr auto kIsTMAMulticastOnA = {IS_TMA_MULTICAST_ON_A};
+
+// Make a templated GEMM
+using gemm_t = WgradGemm<M, N, BLOCK_M, BLOCK_N, BLOCK_K, kNumStages, kLastStages, kNumTMAMulticast, kIsTMAMulticastOnA>;
+
+// Launch kernel
+auto tma_a_desc = gemm_t::make_2d_tma_a_desc(lhs, m, k, a_stride);
+auto tma_b_desc = gemm_t::make_2d_tma_b_desc(rhs, n, k, b_stride);
+auto tma_scales_a_desc = gemm_t::make_2d_tma_scales_a_desc(lhs_scales, m, k);
+auto tma_scales_b_desc = gemm_t::make_2d_tma_scales_b_desc(rhs_scales, n, k);
+auto tma_d_desc = gemm_t::make_2d_tma_d_desc(out, m, n, d_stride);
+gemm_t::run(k,
+            tma_a_desc, tma_b_desc, tma_scales_a_desc, tma_scales_b_desc, tma_d_desc,
+            stream, num_sms, smem_size);
+"""
+
+
+def wgrad_gemm_fp8_fp8_fp32_nt(lhs: Tuple[torch.Tensor, torch.Tensor],
+                               rhs: Tuple[torch.Tensor, torch.Tensor],
+                               out: torch.Tensor) -> None:
+    """
+    Do a weight gradient GEMM with FP8 inputs and FP32 output, with 1x128 LHS scaling and 1x128 RHS scaling.
+    LHS, RHS, and output tensors must be contiguous in dimension 1, i.e., stride(1) = 1.
+    RHS and RHS scaling factors are required to be transposed.
+    The LHS and RHS scaling tensors require a TMA-aligned transposed format; if an input does not match the requirement,
+        this function will transpose it with a set of slow PyTorch operations.
+
+    Arguments:
+        lhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[m, k]`,
+             the second element is an FP32 1x128 scaling tensor for LHS of shape `[m, ⌈k / 128⌉]` (or `[⌈k / 128⌉, m]`).
+        rhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[n, k]`,
+             the second element is an FP32 1x128 scaling tensor for RHS of shape `[n, ⌈k / 128⌉]` (or `[⌈k / 128⌉, n]`).
+        out: the FP32 output tensor of shape `[m, n]`, receiving the result; the function itself returns nothing.
+    """
+    lhs, lhs_scales = lhs
+    rhs, rhs_scales = rhs
+    m, k = lhs.shape
+    n, k_ = rhs.shape
+    m_, n_ = out.shape
+
+    # Type and shape checks
+    assert m == m_ and n == n_ and k == k_
+    assert n > 0 and m > 0
+    assert lhs_scales.shape == (m, (k + 127) // 128) or lhs_scales.shape == ((k + 127) // 128, m)
+    assert rhs_scales.shape == (n, (k + 127) // 128) or rhs_scales.shape == ((k + 127) // 128, n)
+    assert lhs.dtype == torch.float8_e4m3fn and lhs_scales.dtype == torch.float32
+    assert rhs.dtype == torch.float8_e4m3fn and rhs_scales.dtype == torch.float32
+    assert out.dtype == torch.float
+    assert lhs.stride(1) == 1 and out.stride(1) == 1 and rhs.stride(1) == 1
+
+    lhs_stride = lhs.stride(0)
+    rhs_stride = rhs.stride(0)
+    out_stride = out.stride(0)
+
+    # LHS and RHS scales must be transposed for TMA load
+    # NOTES: `get_col_major_tma_aligned_tensor` may launch a kernel if not processed by previous kernels
+    if lhs_scales.shape == ((k + 127) // 128, m):
+        lhs_scales = lhs_scales.permute(1, 0)  # already transposed: only re-view it as `[m, ⌈k / 128⌉]`
+        assert get_tma_aligned_size(m, 4) == m  # no re-layout happens here, so `m` must already be TMA-aligned
+    else:
+        lhs_scales = get_col_major_tma_aligned_tensor(lhs_scales)
+    assert lhs_scales.stride(0) == 1
+
+    if rhs_scales.shape == ((k + 127) // 128, n):
+        rhs_scales = rhs_scales.permute(1, 0)  # already transposed: only re-view it as `[n, ⌈k / 128⌉]`
+        assert get_tma_aligned_size(n, 4) == n  # no re-layout happens here, so `n` must already be TMA-aligned
+    else:
+        rhs_scales = get_col_major_tma_aligned_tensor(rhs_scales)
+    assert rhs_scales.stride(0) == 1
+
+    # Do nothing if `k` is zero (`out` is left untouched)
+    if k == 0:
+        return
+
+    aligned_n = (n + 63) // 64 * 64  # `n` rounded up to a multiple of 64
+    aligned_k = (k + 127) // 128 * 128  # `k` rounded up to a multiple of 128
+
+    # Auto-tuning with compilation
+    global includes, template
+    num_sms = get_num_sms()
+    num_sms, block_m, block_n, num_stages, tma_multicast_config, smem_config = get_best_configs(m, aligned_n, aligned_k, 1, num_sms, is_fp32_out=True, is_wgrad=True)
+    last_stages = (k + 127) // 128 % num_stages  # number of 128-wide K blocks modulo the stage count
+
+    args = (lhs, lhs_scales, rhs, rhs_scales, out, m, n, k,
+            lhs_stride, rhs_stride, out_stride,
+            torch.cuda.current_stream(), num_sms, smem_config[0])
+    runtime = jit_tuner.compile_and_tune(
+        name='gemm_fp8_fp8_fp32_nt_dptp128c_dyn',
+        keys={'M': m, 'N': aligned_n, 'BLOCK_M': block_m, 'BLOCK_N': block_n,
+              'NUM_STAGES': num_stages,
+              'LAST_STAGES': last_stages,
+              'NUM_TMA_MULTICAST': tma_multicast_config[0],
+              'IS_TMA_MULTICAST_ON_A': tma_multicast_config[1]},
+        space=(),
+        includes=includes,
+        arg_defs=(('lhs', torch.float8_e4m3fn), ('lhs_scales', torch.float),
+                  ('rhs', torch.float8_e4m3fn), ('rhs_scales', torch.float),
+                  ('out', torch.float), ('m', int), ('n', int), ('k', int),
+                  ('a_stride', int), ('b_stride', int), ('d_stride', int),
+                  ('stream', torch.cuda.Stream), ('num_sms', int), ('smem_size', int)),
+        template=template,
+        args=args
+    )
+
+    # Run the kernel
+    runtime(*args)
+
+
+def k_grouped_wgrad_gemm_fp8_fp8_fp32_nt(lhs: Tuple[torch.Tensor, torch.Tensor],
+                                         rhs: Tuple[torch.Tensor, torch.Tensor],
+                                         out: torch.Tensor,
+                                         batch_sizes: List[int]):
+    """
+    Perform a k-grouped weight gradient GEMM with FP8 inputs and FP32 output, with 1x128 LHS scaling and 1x128 RHS scaling.
+    This function handles multiple batches with varying k-dimensions, processing each batch sequentially.
+    Each batch's LHS, RHS, and output tensors must be contiguous.
+    The RHS and RHS scaling factors are required to be transposed.
+    The LHS scaling and RHS scaling tensors require TMA-aligned transposed format.
+
+    Arguments:
+        lhs: the first element is a flattened FP8 tensor (typed `torch.float8_e4m3fn`) containing all batches of LHS data,
+                 and the flattened shape is `[sum(m * k for k in batch_sizes)]`, where m is the number of rows.
+             the second element is an FP32 scaling tensor for LHS with shape `[sum(⌈k / 128⌉ for k in batch_sizes), m]`,
+                 representing the per-128-channel scaling factors.
+        rhs: the first element is a flattened FP8 tensor (typed `torch.float8_e4m3fn`) containing all batches of RHS data,
+                 and the flattened shape is `[sum(n * k for k in batch_sizes)]`, where n is the number of rows.
+             the second element is an FP32 scaling tensor for RHS with shape `[sum(⌈k / 128⌉ for k in batch_sizes), n]`,
+                 representing the per-128-channel scaling factors.
+        out: the FP32 output tensor of shape `[num_batches, m, n]`, representing the result.
+        batch_sizes: a list of integers specifying the k-dimension for each batch; one entry is read per batch in `out`.
+    """
+    lhs, lhs_scales = lhs[0].view(-1), lhs[1]  # flatten the data so batches can be sliced by element offset
+    rhs, rhs_scales = rhs[0].view(-1), rhs[1]
+    num_batches, m, n = out.shape
+
+    lhs_offset, rhs_offset, scales_offset = 0, 0, 0  # running offsets into the flat data and the scale rows
+
+    for idx in range(num_batches):
+        k = batch_sizes[idx]
+        A = lhs[lhs_offset:lhs_offset + m * k].view(m, k)
+        B = rhs[rhs_offset:rhs_offset + n * k].view(n, k)
+        A_scales = lhs_scales[scales_offset:scales_offset + (k + 127) // 128]  # ⌈k / 128⌉ rows along dim 0
+        B_scales = rhs_scales[scales_offset:scales_offset + (k + 127) // 128]  # same row range as the LHS scales
+        D = out[idx]
+
+        wgrad_gemm_fp8_fp8_fp32_nt((A, A_scales), (B, B_scales), D)
+
+        lhs_offset += m * k
+        rhs_offset += n * k
+        scales_offset += (k + 127) // 128