Mirror of https://github.com/deepseek-ai/DeepEP, synced 2025-06-26 18:28:11 +00:00
Reduce NVSHMEM gpu memory usage and disable MNNVL. (#190)
Co-authored-by: Shangyan Zhou <sy.zhou@deepseek.com>
Commit: df4debe30c (parent: d8dd185c68)
@@ -79,9 +79,18 @@ class Buffer:
        os.environ['NVSHMEM_IBGDA_NUM_RC_PER_PE'] = f'{num_qps_per_rank}'
        # Make sure QP depth is always larger than the number of on-flight WRs, so that we can skip WQ slot check
        os.environ['NVSHMEM_QP_DEPTH'] = '1024'

        # Reduce gpu memory usage
        # 6 default teams + 1 extra team
        os.environ['NVSHMEM_MAX_TEAMS'] = '7'
        # Disable NVLink SHArP
        os.environ['NVSHMEM_DISABLE_NVLS'] = '1'
        # NOTES: NVSHMEM initialization requires at least 256 MiB
        os.environ['NVSHMEM_CUMEM_GRANULARITY'] = f'{2 ** 29}'

        # Disable multi-node NVLink detection
        os.environ['NVSHMEM_DISABLE_MNNVL'] = '1'

        # Synchronize using the root ID
        nvshmem_unique_ids = [None, ] * self.group_size
        if (low_latency_mode and self.rank == 0) or (not low_latency_mode and self.runtime.get_rdma_rank() == 0):
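
For illustration only, a minimal sketch of applying the same NVSHMEM settings from a user-side launcher script, before the DeepEP buffer is constructed. The helper name apply_nvshmem_env is hypothetical and not part of this diff; num_qps_per_rank mirrors the variable used above.

    import os

    def apply_nvshmem_env(num_qps_per_rank: int) -> None:
        # Illustrative sketch (not DeepEP code): these must be set before NVSHMEM initializes.
        # Number of IBGDA RC QPs per PE, matching the QP count DeepEP uses per rank.
        os.environ['NVSHMEM_IBGDA_NUM_RC_PER_PE'] = f'{num_qps_per_rank}'
        # Keep the QP depth above the number of in-flight WRs so WQ slot checks can be skipped.
        os.environ['NVSHMEM_QP_DEPTH'] = '1024'
        # 6 default teams + 1 extra team; a smaller team table reserves less GPU memory.
        os.environ['NVSHMEM_MAX_TEAMS'] = '7'
        # Disable NVLink SHArP (NVLS) resources.
        os.environ['NVSHMEM_DISABLE_NVLS'] = '1'
        # 2 ** 29 bytes = 512 MiB; NVSHMEM initialization requires at least 256 MiB.
        os.environ['NVSHMEM_CUMEM_GRANULARITY'] = f'{2 ** 29}'
        # Disable multi-node NVLink (MNNVL) detection.
        os.environ['NVSHMEM_DISABLE_MNNVL'] = '1'

Because NVSHMEM reads these variables at initialization time, the assignments in this diff sit inside Buffer's setup (as shown in the hunk above) so that they take effect before NVSHMEM is initialized.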