mirror of
https://github.com/deepseek-ai/DeepEP
synced 2025-06-26 18:28:11 +00:00
Support Ampere architecture (#204)
* Update README * Update `setup.py` * Fix headers * Add `DISABLE_NVSHMEM` for APIs * Fix launch * Fix TMA settings * Fix TMA usages * Fix dlink * Separate layout kernels * Update version * Add `is_sm90_compiled` * Fix tests * Add NVLink connection checks * Update README * Fix tests * Add some comments * Minor fix * Minor fix * Fix bugs
This commit is contained in:
@@ -7,7 +7,7 @@ from typing import Callable, List, Tuple, Optional, Union
|
||||
import deep_ep_cpp
|
||||
# noinspection PyUnresolvedReferences
|
||||
from deep_ep_cpp import Config, EventHandle
|
||||
from .utils import EventOverlap
|
||||
from .utils import EventOverlap, check_nvlink_connections
|
||||
|
||||
|
||||
class Buffer:
|
||||
@@ -50,6 +50,7 @@ class Buffer:
|
||||
please make sure all connections are via NVLink.
|
||||
allow_mnnvl: whether to allow MNNVL
|
||||
"""
|
||||
check_nvlink_connections(group)
|
||||
|
||||
# Initialize the CPP runtime
|
||||
self.rank = group.rank()
|
||||
@@ -105,6 +106,10 @@ class Buffer:
|
||||
self.runtime.sync(device_ids, ipc_handles, root_unique_id)
|
||||
assert self.runtime.is_available()
|
||||
|
||||
@staticmethod
def is_sm90_compiled():
    """
    Report whether the CPP extension was built with SM90 support.

    This is a thin forwarder to `deep_ep_cpp.is_sm90_compiled()`.

    Returns:
        the value returned by the extension (presumably a boolean flag; confirm against the CPP binding).
    """
    compiled_with_sm90 = deep_ep_cpp.is_sm90_compiled()
    return compiled_with_sm90
|
||||
|
||||
@staticmethod
|
||||
def set_num_sms(new_num_sms: int) -> None:
|
||||
"""
|
||||
|
||||
@@ -1,4 +1,7 @@
|
||||
import os
|
||||
import subprocess
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
from typing import Any, Optional, Tuple
|
||||
|
||||
# noinspection PyUnresolvedReferences
|
||||
@@ -58,3 +61,28 @@ class EventOverlap:
|
||||
"""
|
||||
if self.event is not None:
|
||||
self.event.current_stream_wait()
|
||||
|
||||
|
||||
def check_nvlink_connections(group: dist.ProcessGroup):
    """
    Check NVLink connection between every pair of GPUs.

    The check only runs when the current CUDA device name contains 'PCIE'; for any other
    device name the function returns immediately without communicating.

    Arguments:
        group: the communication group.

    Raises:
        AssertionError: if the group has more than 2 ranks on a PCIE device, or if the
            `nvidia-smi topo -p2p n` matrix does not report `X`/`OK` between this GPU and
            any GPU in the group.
        subprocess.CalledProcessError: if `nvidia-smi` exits with a non-zero status.
    """
    # Check NVLink connection
    # NOTES: some A100 PCIE GPUs only have pairwise NVLink connection, so that we can only use EP2
    if 'PCIE' not in torch.cuda.get_device_name():
        return

    # Raise explicitly (instead of `assert`) so that the check survives `python -O`
    if group.size() > 2:
        raise AssertionError('No NVLink connection between all GPUs')

    # Map the logical CUDA device index to the physical index used by `nvidia-smi`.
    # Without `CUDA_VISIBLE_DEVICES` no remapping is applied, so the logical index is used as-is
    # (this also covers hosts with more than 8 devices).
    local_device_idx = torch.cuda.current_device()
    visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES')
    if visible_devices is None:
        physical_device_idx = local_device_idx
    else:
        physical_device_idx = int(visible_devices.strip(',').split(',')[local_device_idx])

    # Every rank contributes its physical index; this is a collective call
    physical_device_indices = [0] * group.size()
    dist.all_gather_object(physical_device_indices, physical_device_idx, group)

    # Get connection matrix from `nvidia-smi`
    output = subprocess.check_output(['nvidia-smi', 'topo', '-p2p', 'n']).decode('utf-8')
    row_label = f'GPU{physical_device_idx}'
    for line in output.split('\n'):
        tokens = line.split()
        # Match the row label exactly: a prefix match would confuse `GPU1` with `GPU10`.
        # The header line also starts with a GPU label but never contains the `X` (self) marker.
        if not tokens or tokens[0] != row_label:
            continue
        status = tokens[1:]
        if 'X' not in status:
            continue
        for dst_gpu_rank in physical_device_indices:
            # A missing column is treated as "not connected" rather than an `IndexError`
            connected = dst_gpu_rank < len(status) and status[dst_gpu_rank] in ('X', 'OK')
            if not connected:
                raise AssertionError(f'No NVLink connection between GPU {physical_device_idx} and GPU {dst_gpu_rank}')
|
||||
|
||||
Reference in New Issue
Block a user