mirror of
https://github.com/deepseek-ai/DeepEP
synced 2025-06-26 18:28:11 +00:00
Code cleanup and bug fixes
This commit is contained in:
@@ -31,7 +31,8 @@ class Buffer:
|
||||
|
||||
def __init__(self, group: dist.ProcessGroup,
|
||||
num_nvl_bytes: int = 0, num_rdma_bytes: int = 0,
|
||||
low_latency_mode: bool = False, num_qps_per_rank: int = 12) -> None:
|
||||
low_latency_mode: bool = False, num_qps_per_rank: int = 12,
|
||||
allow_nvlink_for_low_latency_mode: bool = False) -> None:
|
||||
"""
|
||||
Initialize the communication buffer.
|
||||
|
||||
@@ -42,6 +43,10 @@ class Buffer:
|
||||
low_latency_mode: whether to enable low-latency mode.
|
||||
num_qps_per_rank: the number of QPs for RDMA, the low-latency mode requires that this number equals
|
||||
the number of local experts.
|
||||
allow_nvlink_for_low_latency_mode: whether to allow NVLink traffic in low-latency mode; note that
|
||||
this is somewhat incompatible with hook-based overlapping.
|
||||
Warning: PCIe connections may lead to errors due to memory ordering issues,
|
||||
please make sure all connections are via NVLink.
|
||||
"""
|
||||
|
||||
# Initialize the CPP runtime
|
||||
@@ -68,8 +73,7 @@ class Buffer:
|
||||
if self.runtime.get_num_rdma_ranks() > 1 or low_latency_mode:
|
||||
# Enable IBGDA
|
||||
assert num_qps_per_rank > 0
|
||||
if not os.getenv("NVSHMEM_DISABLE_P2P"):
|
||||
os.environ['NVSHMEM_DISABLE_P2P'] = '1'
|
||||
os.environ['NVSHMEM_DISABLE_P2P'] = '0' if allow_nvlink_for_low_latency_mode else '1'
|
||||
os.environ['NVSHMEM_IB_ENABLE_IBGDA'] = '1'
|
||||
os.environ['NVSHMEM_IBGDA_NIC_HANDLER'] = 'gpu'
|
||||
os.environ['NVSHMEM_IBGDA_NUM_RC_PER_PE'] = f'{num_qps_per_rank}'
|
||||
|
||||
Reference in New Issue
Block a user