mirror of
https://github.com/deepseek-ai/DeepGEMM
synced 2025-06-26 23:15:49 +00:00
* Init weight gradient kernels.
* Support unaligned n,k and gmem stride
* Update docs
* Several cleanups
* Remove restrictions on N
* Add stride(0) assertions

---------

Co-authored-by: Chenggang Zhao <chenggangz@deepseek.com>
16 lines
425 B
Python
16 lines
425 B
Python
import torch
|
|
|
|
from . import jit
|
|
from .jit_kernels import (
|
|
gemm_fp8_fp8_bf16_nt,
|
|
m_grouped_gemm_fp8_fp8_bf16_nt_contiguous,
|
|
m_grouped_gemm_fp8_fp8_bf16_nt_masked,
|
|
wgrad_gemm_fp8_fp8_fp32_nt,
|
|
k_grouped_wgrad_gemm_fp8_fp8_fp32_nt,
|
|
ceil_div,
|
|
set_num_sms, get_num_sms,
|
|
get_col_major_tma_aligned_tensor,
|
|
get_m_alignment_for_contiguous_layout
|
|
)
|
|
from .utils import bench, bench_kineto, calc_diff
|