Support Ampere architecture (#204)

* Update README
* Update `setup.py`
* Fix headers
* Add `DISABLE_NVSHMEM` for APIs
* Fix launch
* Fix TMA settings
* Fix TMA usages
* Fix dlink
* Separate layout kernels
* Update version
* Add `is_sm90_compiled`
* Fix tests
* Add NVLink connection checks
* Update README
* Fix tests
* Add some comments
* Minor fix
* Minor fix
* Fix bugs
2025-06-26 18:28:11 +00:00 · 2025-06-11 15:48:18 +08:00
parent dd13c7145c
commit b8d90fb753
16 changed files with 413 additions and 174 deletions
--- a/setup.py
+++ b/setup.py
@@ -6,34 +6,76 @@ from torch.utils.cpp_extension import BuildExtension, CUDAExtension

 if __name__ == '__main__':
    nvshmem_dir = os.getenv('NVSHMEM_DIR', None)
-    assert nvshmem_dir is not None and os.path.exists(nvshmem_dir), 'Failed to find NVSHMEM'
-    print(f'NVSHMEM directory: {nvshmem_dir}')
+    disable_nvshmem = nvshmem_dir is None
+    if disable_nvshmem:
+        print('Warning: `NVSHMEM_DIR` is not specified, all internode and low-latency features are disabled\n')
+    else:
+        assert os.path.exists(nvshmem_dir), f'Failed to find NVSHMEM: {nvshmem_dir}'

-    # TODO: currently, we only support Hopper architecture, we may add Ampere support later
-    if os.getenv('TORCH_CUDA_ARCH_LIST', None) is None:
-        os.environ['TORCH_CUDA_ARCH_LIST'] = '9.0'
    cxx_flags = ['-O3', '-Wno-deprecated-declarations', '-Wno-unused-variable',
                 '-Wno-sign-compare', '-Wno-reorder', '-Wno-attributes']
-    nvcc_flags = ['-O3', '-Xcompiler', '-O3', '-rdc=true', '--ptxas-options=--register-usage-level=10']
-    include_dirs = ['csrc/', f'{nvshmem_dir}/include']
-    sources = ['csrc/deep_ep.cpp',
-               'csrc/kernels/runtime.cu', 'csrc/kernels/intranode.cu',
-               'csrc/kernels/internode.cu', 'csrc/kernels/internode_ll.cu']
-    library_dirs = [f'{nvshmem_dir}/lib']
+    nvcc_flags = ['-O3', '-Xcompiler', '-O3']
+    sources = ['csrc/deep_ep.cpp', 'csrc/kernels/runtime.cu', 'csrc/kernels/layout.cu', 'csrc/kernels/intranode.cu']
+    include_dirs = ['csrc/']
+    library_dirs = []
+    nvcc_dlink = []
+    extra_link_args = []
+
+    # NVSHMEM flags
+    if disable_nvshmem:
+        cxx_flags.append('-DDISABLE_NVSHMEM')
+        nvcc_flags.append('-DDISABLE_NVSHMEM')
+    else:
+        sources.extend(['csrc/kernels/internode.cu', 'csrc/kernels/internode_ll.cu'])
+        include_dirs.extend([f'{nvshmem_dir}/include'])
+        library_dirs.extend([f'{nvshmem_dir}/lib'])
+        nvcc_dlink.extend(['-dlink', f'-L{nvshmem_dir}/lib', '-lnvshmem'])
+        extra_link_args.extend(['-l:libnvshmem.a', '-l:nvshmem_bootstrap_uid.so', f'-Wl,-rpath,{nvshmem_dir}/lib'])
+
+    if int(os.getenv('DISABLE_SM90_FEATURES', 0)):
+        # Prefer A100
+        os.environ['TORCH_CUDA_ARCH_LIST'] = os.getenv('TORCH_CUDA_ARCH_LIST', '8.0')
+
+        # Disable some SM90 features: FP8, launch methods, and TMA
+        cxx_flags.append('-DDISABLE_SM90_FEATURES')
+        nvcc_flags.append('-DDISABLE_SM90_FEATURES')
+
+        # Disable internode and low-latency kernels
+        assert disable_nvshmem
+
+        # Disable LD/ST tricks, as some CUDA versions do not support `.L1::no_allocate`
+        assert int(os.getenv('DISABLE_AGGRESSIVE_PTX_INSTRS', 1)) == 1
+        os.environ['DISABLE_AGGRESSIVE_PTX_INSTRS'] = '1'
+    else:
+        # Prefer H800 series
+        os.environ['TORCH_CUDA_ARCH_LIST'] = os.getenv('TORCH_CUDA_ARCH_LIST', '9.0')
+
+        # CUDA 12 flags
+        nvcc_flags.extend(['-rdc=true', '--ptxas-options=--register-usage-level=10'])

    # Disable aggressive PTX instructions
    if int(os.getenv('DISABLE_AGGRESSIVE_PTX_INSTRS', '0')):
        cxx_flags.append('-DDISABLE_AGGRESSIVE_PTX_INSTRS')
        nvcc_flags.append('-DDISABLE_AGGRESSIVE_PTX_INSTRS')

-    # Disable DLTO (default by PyTorch)
-    nvcc_dlink = ['-dlink', f'-L{nvshmem_dir}/lib', '-lnvshmem']
-    extra_link_args = ['-l:libnvshmem.a', '-l:nvshmem_bootstrap_uid.so', f'-Wl,-rpath,{nvshmem_dir}/lib']
+    # Put them together
    extra_compile_args = {
        'cxx': cxx_flags,
        'nvcc': nvcc_flags,
-        'nvcc_dlink': nvcc_dlink
    }
+    if len(nvcc_dlink) > 0:
+        extra_compile_args['nvcc_dlink'] = nvcc_dlink
+
+    # Summary
+    print(f'Build summary:')
+    print(f' > Sources: {sources}')
+    print(f' > Includes: {include_dirs}')
+    print(f' > Libraries: {library_dirs}')
+    print(f' > Compilation flags: {extra_compile_args}')
+    print(f' > Link flags: {extra_link_args}')
+    print(f' > Arch list: {os.environ["TORCH_CUDA_ARCH_LIST"]}')
+    print(f' > NVSHMEM path: {nvshmem_dir}')
+    print()

    # noinspection PyBroadException
    try:
@@ -44,7 +86,7 @@ if __name__ == '__main__':

    setuptools.setup(
        name='deep_ep',
-        version='1.0.0' + revision,
+        version='1.1.0' + revision,
        packages=setuptools.find_packages(
            include=['deep_ep']
        ),