Refactor JIT compilation (+NVRTC support) (#94)

* [wip] refactor: compile to .cubin
  Signed-off-by: Zihua Wu <13583761+lucifer1004@users.noreply.github.com>
* refactor: compile to .cubin and add NVRTC option
  Signed-off-by: Zihua Wu <13583761+lucifer1004@users.noreply.github.com>
* fix: compiler version
  Signed-off-by: Zihua Wu <13583761+lucifer1004@users.noreply.github.com>
* feat: compat for old drivers
  Signed-off-by: Zihua Wu <13583761+lucifer1004@users.noreply.github.com>
* feat: save kernel name to file
  Signed-off-by: Zihua Wu <13583761+lucifer1004@users.noreply.github.com>
* feat: fix win compat
  Signed-off-by: Zihua Wu <13583761+lucifer1004@users.noreply.github.com>
* fix: windows compat
  Signed-off-by: Gabriel Wu <13583761+lucifer1004@users.noreply.github.com>
* feat: make API more general
  Signed-off-by: Zihua Wu <13583761+lucifer1004@users.noreply.github.com>
* feat: drop support for CUDA<12.3
  Signed-off-by: Zihua Wu <13583761+lucifer1004@users.noreply.github.com>
* doc: update README
  Signed-off-by: Zihua Wu <13583761+lucifer1004@users.noreply.github.com>
* Some lints and refactor
* Refactor runtime
* Several fixes
* Refactor environment variables
* Code format
* Add a TODO
* Compatible with CUDA 12.3
* Fix indent
* Fix typing
* Drop support for Windows
* Add a TODO
---------
Signed-off-by: Zihua Wu <13583761+lucifer1004@users.noreply.github.com>
Signed-off-by: Gabriel Wu <13583761+lucifer1004@users.noreply.github.com>
Co-authored-by: Chenggang Zhao <chenggangz@deepseek.com>
2025-06-26 23:15:49 +00:00 · 2025-05-07 11:38:14 +08:00
parent d374456787
commit bfe983c4c2
19 changed files with 909 additions and 660 deletions
--- a/deep_gemm/jit_kernels/tuner.py
+++ b/deep_gemm/jit_kernels/tuner.py
@@ -1,9 +1,10 @@
 import copy
 import os
 import torch
-from typing import Any, Dict
+import cuda.bindings.driver as cbd
+from typing import Any, Callable, Dict, Type, Tuple

-from ..jit import build, cpp_format, generate, Runtime
+from ..jit import build, Runtime


 class JITTuner:
@@ -11,22 +12,21 @@ class JITTuner:
        self.tuned = {}

    def compile_and_tune(self, name: str, keys: Dict[str, Any], space: tuple,
-                         includes: tuple, arg_defs: tuple, template: str, args: tuple) -> Runtime:
-        # NOTES: we always assume the space and template will not change
-        # We also assume the GPU device will not be changed
+                         kwargs: Dict[str, Any], runtime_cls: Type[Runtime]) -> Tuple[Runtime, Dict[str, Any]]:
+        # NOTES: we always assume the space, template and GPU devices will not change
        # NOTES: the function must have no accumulated side effects
        keys = {k: keys[k] for k in sorted(keys.keys())}
        signature = (name, f'{keys}')
        if signature in self.tuned:
-            if os.getenv('DG_JIT_DEBUG', None):
+            if int(os.getenv('DG_JIT_DEBUG', 0)):
                print(f'Using cached JIT kernel {name} with keys {keys}')
            return self.tuned[signature]

-        if os.getenv('DG_JIT_DEBUG', None):
+        if int(os.getenv('DG_JIT_DEBUG', 0)):
            print(f'Auto-tuning JIT kernel {name} with keys {keys}')

        assert signature not in self.tuned
-        assert args is not None
+        assert kwargs is not None
        space = (dict(), ) if len(space) == 0 else space

        kernels = []
@@ -34,30 +34,31 @@ class JITTuner:
            assert isinstance(tuned_keys, dict)
            full_keys = copy.deepcopy(keys)
            full_keys.update(tuned_keys)
-            code = generate(includes, arg_defs, cpp_format(template, full_keys))
-
-            # Illegal build must raise errors
-            kernels.append((build(name, arg_defs, code), tuned_keys))
+            code = runtime_cls.generate(**kwargs, **full_keys)
+            kernels.append((build(name, code, runtime_cls), full_keys))

+        # TODO: fix tuning with space > 1
        best_runtime, best_time, best_keys = None, None, None
        for runtime, tuned_keys in kernels:
            if len(space) > 1:
                # Check kernel validity
-                return_code = runtime(*args)
-                if return_code != 0:
-                    # Pass illegal kernels, e.g. insufficient shared memory capacity
-                    if os.getenv('DG_JIT_DEBUG', None):
+                return_code = runtime(**tuned_keys, **kwargs)
+                if return_code != cbd.CUresult.CUDA_SUCCESS:
+                    # Pass illegal kernels, e.g., insufficient shared memory capacity
+                    if int(os.getenv('DG_JIT_DEBUG', 0)):
                        print(f'Illegal JIT kernel {name} with keys {keys} and tuned keys {tuned_keys}: error code {return_code}')
                    continue

                # Measure performance with L2 flush and a large GEMM kernel before to reduce overhead between kernels
                start_event = torch.cuda.Event(enable_timing=True)
                end_event = torch.cuda.Event(enable_timing=True)
-                torch.empty(int(256e6 // 4), dtype=torch.int, device='cuda').zero_()
-                torch.randn((8192, 8192), dtype=torch.float, device='cuda') @ torch.randn((8192, 8192), dtype=torch.float, device='cuda')
+                torch.empty(int(256e6 // 4), dtype=torch.int,
+                            device='cuda').zero_()
+                torch.randn((8192, 8192), dtype=torch.float, device='cuda') @ torch.randn(
+                    (8192, 8192), dtype=torch.float, device='cuda')
                start_event.record()
                for i in range(20):
-                    assert runtime(*args) == 0
+                    assert runtime(**tuned_keys, **kwargs) == cbd.CUresult.CUDA_SUCCESS
                end_event.record()
                end_event.synchronize()
                elapsed_time = start_event.elapsed_time(end_event)
@@ -67,15 +68,16 @@ class JITTuner:
            # Compare if better
            if best_time is None or elapsed_time < best_time:
                best_runtime, best_time, best_keys = runtime, elapsed_time, tuned_keys
-            if os.getenv('DG_JIT_DEBUG', None):
+            if int(os.getenv('DG_JIT_DEBUG', 0)):
                print(f'Tuned JIT kernel {name} with keys {keys} and tuned keys {tuned_keys} has time {elapsed_time}')
        assert best_runtime is not None, f'Failed to tune JIT kernel {name} with keys {keys}'

        # Cache the best runtime and return
-        if os.getenv('DG_JIT_DEBUG', None) or os.getenv('DG_PRINT_AUTOTUNE', None):
-            print(f'Best JIT kernel {name} with keys {keys} has tuned keys {best_keys} and time {best_time}')
-        self.tuned[signature] = best_runtime
-        return best_runtime
+        if int(os.getenv('DG_JIT_DEBUG', 0)) or int(os.getenv('DG_PRINT_AUTOTUNE', 0)):
+            print(
+                f'Best JIT kernel {name} with keys {keys} has tuned keys {best_keys} and time {best_time}')
+        self.tuned[signature] = (best_runtime, best_keys)
+        return best_runtime, best_keys


 jit_tuner = JITTuner()