mirror of
https://github.com/deepseek-ai/DeepGEMM
synced 2025-06-26 23:15:49 +00:00
Refactor JIT compilation (+NVRTC support) (#94)
* [wip] refactor: compile to .cubin Signed-off-by: Zihua Wu <13583761+lucifer1004@users.noreply.github.com> * refactor: compile to .cubin and add NVRTC option Signed-off-by: Zihua Wu <13583761+lucifer1004@users.noreply.github.com> * fix: compiler version Signed-off-by: Zihua Wu <13583761+lucifer1004@users.noreply.github.com> * feat: compat for old drivers Signed-off-by: Zihua Wu <13583761+lucifer1004@users.noreply.github.com> * feat: save kernel name to file Signed-off-by: Zihua Wu <13583761+lucifer1004@users.noreply.github.com> * feat: fix win compat Signed-off-by: Zihua Wu <13583761+lucifer1004@users.noreply.github.com> * fix: windows compat Signed-off-by: Gabriel Wu <13583761+lucifer1004@users.noreply.github.com> * feat: make API more general Signed-off-by: Zihua Wu <13583761+lucifer1004@users.noreply.github.com> * feat: drop support for CUDA<12.3 Signed-off-by: Zihua Wu <13583761+lucifer1004@users.noreply.github.com> * doc: update README Signed-off-by: Zihua Wu <13583761+lucifer1004@users.noreply.github.com> * Some lints and refactor * Refactor runtime * Several fixes * Refactor environment variables * Code format * Add a TODO * Compatible with CUDA 12.3 * Fix indent * Fix typing * Drop support for Windows * Add a TODO --------- Signed-off-by: Zihua Wu <13583761+lucifer1004@users.noreply.github.com> Signed-off-by: Gabriel Wu <13583761+lucifer1004@users.noreply.github.com> Co-authored-by: Chenggang Zhao <chenggangz@deepseek.com>
This commit is contained in:
@@ -80,25 +80,10 @@ class suppress_stdout_stderr:
|
||||
def bench_kineto(fn, kernel_names, num_tests: int = 30, suppress_kineto_output: bool = False,
|
||||
trace_path: str = None, barrier_comm_profiling: bool = False, flush_l2: bool = True):
|
||||
# Conflict with Nsight Systems
|
||||
using_nsys = os.environ.get('DG_NSYS_PROFILING', False)
|
||||
using_nsys = int(os.environ.get('DG_NSYS_PROFILING', 0))
|
||||
|
||||
# By default, flush L2 with an excessive 8GB memset to give the GPU some (literal) chill time without full idle
|
||||
# this avoid thermal throttling while keeping DVFS at max clocks (slight gain vs sleep / more consistent on GH200)
|
||||
sleep_between_tests = 0.0
|
||||
flush_l2_size = int(8e9 // 4)
|
||||
if os.environ.get('DG_BENCH_DISABLE_L2_FLUSH', False):
|
||||
flush_l2 = False
|
||||
if os.environ.get('DG_BENCH_POWER_LIMITED', False):
|
||||
# if we want to be thermally limited, we need to run many iterations non-stop for a fairly long time
|
||||
# and spend as little time as possible doing memset and other setup work (80MiB should be enough to flush L2)
|
||||
num_tests = 2000
|
||||
flush_l2_size = int(80e6 // 4)
|
||||
sleep_val = os.environ.get('DG_BENCH_SLEEP_BETWEEN_TESTS', False)
|
||||
if sleep_val:
|
||||
try:
|
||||
sleep_between_tests = float(sleep_val)
|
||||
except ValueError:
|
||||
pass # Keep default
|
||||
|
||||
# For some auto-tuning kernels with prints
|
||||
fn()
|
||||
@@ -117,8 +102,6 @@ def bench_kineto(fn, kernel_names, num_tests: int = 30, suppress_kineto_output:
|
||||
lhs @ rhs
|
||||
dist.all_reduce(torch.ones(1, dtype=torch.float, device='cuda'))
|
||||
for _ in range(num_tests):
|
||||
if sleep_between_tests > 0.0:
|
||||
time.sleep(sleep_between_tests)
|
||||
if flush_l2:
|
||||
torch.empty(flush_l2_size, dtype=torch.int, device='cuda').zero_()
|
||||
fn()
|
||||
|
||||
Reference in New Issue
Block a user