Support Ampere architecture (#204)

* Update README
* Update `setup.py`
* Fix headers
* Add `DISABLE_NVSHMEM` for APIs
* Fix launch
* Fix TMA settings
* Fix TMA usages
* Fix dlink
* Separate layout kernels
* Update version
* Add `is_sm90_compiled`
* Fix tests
* Add NVLink connection checks
* Update README
* Fix tests
* Add some comments
* Minor fix
* Minor fix
* Fix bugs
2025-06-26 18:28:11 +00:00 · 2025-06-11 15:48:18 +08:00
parent dd13c7145c
commit b8d90fb753
16 changed files with 413 additions and 174 deletions
--- a/deep_ep/buffer.py
+++ b/deep_ep/buffer.py
@@ -7,7 +7,7 @@ from typing import Callable, List, Tuple, Optional, Union
 import deep_ep_cpp
 # noinspection PyUnresolvedReferences
 from deep_ep_cpp import Config, EventHandle
-from .utils import EventOverlap
+from .utils import EventOverlap, check_nvlink_connections


 class Buffer:
@@ -50,6 +50,7 @@ class Buffer:
                please make sure all connections are via NVLink.
            allow_mnnvl: whether to allow MNNVL
        """
+        check_nvlink_connections(group)

        # Initialize the CPP runtime
        self.rank = group.rank()
@@ -105,6 +106,10 @@ class Buffer:
        self.runtime.sync(device_ids, ipc_handles, root_unique_id)
        assert self.runtime.is_available()

+    @staticmethod
+    def is_sm90_compiled():
+        """
+        Forward the `is_sm90_compiled` query to the CPP runtime module.
+
+        Returns:
+            the value returned by `deep_ep_cpp.is_sm90_compiled()`, presumably whether the extension
+              was built with SM90 support (TODO: confirm against the CPP binding).
+        """
+        sm90_compiled = deep_ep_cpp.is_sm90_compiled()
+        return sm90_compiled
+
    @staticmethod
    def set_num_sms(new_num_sms: int) -> None:
        """
--- a/deep_ep/utils.py
+++ b/deep_ep/utils.py
@@ -1,4 +1,7 @@
+import os
+import subprocess
 import torch
+import torch.distributed as dist
 from typing import Any, Optional, Tuple

 # noinspection PyUnresolvedReferences
@@ -58,3 +61,28 @@ class EventOverlap:
        """
        if self.event is not None:
            self.event.current_stream_wait()
+
+
+def check_nvlink_connections(group: dist.ProcessGroup):
+    """
+    Check NVLink connection between every pair of GPUs.
+
+    The check only runs when the current device name contains `PCIE`; for any other device the
+    function returns without doing anything. On such devices the group may have at most 2 ranks, and
+    the local GPU's row of the `nvidia-smi topo -p2p n` matrix must report `X` or `OK` for every
+    physical GPU used by the group.
+
+    Arguments:
+        group: the communication group.
+
+    Raises:
+        AssertionError: if the group has more than 2 ranks, or a GPU of the group is not reported
+          as `X`/`OK` (or has no column at all) in the local GPU's row.
+        ValueError: if the `CUDA_VISIBLE_DEVICES` entry of the current device is not an integer index.
+        subprocess.CalledProcessError: if `nvidia-smi` exits with a non-zero status.
+    """
+    # Check NVLink connection
+    # NOTES: some A100 PCIE GPUs only have pairwise NVLink connection, so that we can only use EP2
+    if 'PCIE' not in torch.cuda.get_device_name():
+        return
+    assert group.size() <= 2, 'No NVLink connection between all GPUs'
+
+    # Map the local CUDA device to its physical index, then share it with every rank
+    # NOTES: all ranks of the group must reach this collective call
+    devices = os.environ.get('CUDA_VISIBLE_DEVICES', '0,1,2,3,4,5,6,7').strip(',').split(',')
+    physical_device_idx = int(devices[torch.cuda.current_device()])
+    physical_device_indices = [0, ] * group.size()
+    dist.all_gather_object(physical_device_indices, physical_device_idx, group)
+
+    # Get connection matrix from `nvidia-smi`
+    lines = subprocess.check_output(['nvidia-smi', 'topo', '-p2p', 'n']).decode('utf-8').split('\n')
+    row_label = f'GPU{physical_device_idx}'
+    for line in lines:
+        tokens = line.split()
+        # NOTES: compare the whole first token, a prefix match on `GPU1` would also accept `GPU10`;
+        # the header row starts with a GPU label as well, but it has no `X` (self) entry
+        if not tokens or tokens[0] != row_label or 'X' not in tokens[1:]:
+            continue
+
+        # Columns are indexed by physical GPU index
+        status = tokens[1:]
+        for dst_device_idx in physical_device_indices:
+            connected = dst_device_idx < len(status) and status[dst_device_idx] in ('X', 'OK')
+            assert connected, f'No NVLink connection between GPU {physical_device_idx} and GPU {dst_device_idx}'