mirror of
https://github.com/deepseek-ai/DeepEP
synced 2025-06-26 18:28:11 +00:00
Add ENABLE_FAST_DEBUG
This commit is contained in:
@@ -6,6 +6,7 @@ set(CMAKE_VERBOSE_MAKEFILE ON)
|
||||
# Host-side C and C++ sources: optimise and emit position-independent code.
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 -fPIC")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fPIC")
set(CUDA_SEPARABLE_COMPILATION ON)

# ENABLE_FAST_DEBUG controls whether -DENABLE_FAST_DEBUG is passed to NVCC.
# Sources test this macro with #ifndef to pick between two sets of timeout
# constants. The option defaults to ON so an untouched configuration builds
# exactly as before; pass -DENABLE_FAST_DEBUG=OFF to build without the define.
option(ENABLE_FAST_DEBUG "Pass -DENABLE_FAST_DEBUG to NVCC" ON)
if(ENABLE_FAST_DEBUG)
  list(APPEND CUDA_NVCC_FLAGS "-DENABLE_FAST_DEBUG")
endif()

list(APPEND CUDA_NVCC_FLAGS "-O3")
# Ask ptxas for verbose per-kernel resource output and a warning whenever a
# kernel uses local memory.
list(APPEND CUDA_NVCC_FLAGS "--ptxas-options=--verbose,--register-usage-level=10,--warn-on-local-memory-usage")
|
||||
|
||||
|
||||
@@ -7,9 +7,14 @@
|
||||
#define NUM_BUFFER_ALIGNMENT_BYTES 128

#define FINISHED_SUM_TAG 1024
// Same value in every build mode, so it is defined exactly once, outside the
// ENABLE_FAST_DEBUG conditional below.
#define NUM_WAIT_NANOSECONDS 500

// Timeout budgets: building with ENABLE_FAST_DEBUG defined selects values
// ten times smaller than the defaults.
#ifndef ENABLE_FAST_DEBUG
#define NUM_CPU_TIMEOUT_SECS 100
#define NUM_TIMEOUT_CYCLES 200000000000ull // 200G cycles ~= 100s
#else
#define NUM_CPU_TIMEOUT_SECS 10
#define NUM_TIMEOUT_CYCLES 20000000000ull // 20G cycles ~= 10s
#endif

#define LOW_LATENCY_SEND_PHASE 1
#define LOW_LATENCY_RECV_PHASE 2
|
||||
|
||||
@@ -58,12 +58,9 @@ cfg.dynamicSmemBytes = smem_size;
|
||||
// Expands to a switch over `num_ranks / NUM_MAX_NVL_PEERS` (both names are
// resolved at the expansion site) and invokes `case_macro(N)` for the matching
// supported value N. Any other value reaches EP_HOST_ASSERT with a false
// condition. The trailing `while (false)` pairs with the semicolon written
// after the macro invocation.
// NOTE(review): no case contains its own `break`; presumably `case_macro`
// supplies one, otherwise execution falls through to later cases - confirm.
#define SWITCH_RDMA_RANKS(case_macro)                                  \
    switch (num_ranks / NUM_MAX_NVL_PEERS) {                           \
        case 2:  case_macro(2);                                        \
        case 3:  case_macro(3);                                        \
        case 4:  case_macro(4);                                        \
        case 8:  case_macro(8);                                        \
        case 16: case_macro(16);                                       \
        case 18: case_macro(18);                                       \
        case 20: case_macro(20);                                       \
        default: EP_HOST_ASSERT(false and "Unsupported RDMA ranks");   \
    } while (false)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user