# Patch summary: adds an ENABLE_FAST_DEBUG compile-time switch and trims SWITCH_RDMA_RANKS.
#
# csrc/CMakeLists.txt
#   Appends "-DENABLE_FAST_DEBUG" to CUDA_NVCC_FLAGS, ahead of the existing "-O3" entry.
#   NOTE(review): the define is appended unconditionally, so every build configured through
#   this CMakeLists gets the shortened timeouts below - confirm that is intended.
#   NOTE(review): the define is passed via CUDA_NVCC_FLAGS only; whether host-compiled
#   sources that include configs.cuh also see it is not visible here - TODO confirm.
#
# csrc/kernels/configs.cuh
#   Moves NUM_WAIT_NANOSECONDS (still 500) above the new guard and makes the two timeout
#   macros conditional:
#     ENABLE_FAST_DEBUG not defined: NUM_CPU_TIMEOUT_SECS 100, NUM_TIMEOUT_CYCLES 200000000000ull
#     ENABLE_FAST_DEBUG defined:     NUM_CPU_TIMEOUT_SECS 10,  NUM_TIMEOUT_CYCLES 20000000000ull
#
# csrc/kernels/launch.cuh
#   Removes the `case 3`, `case 18` and `case 20` labels from SWITCH_RDMA_RANKS, leaving
#   2, 4, 8 and 16. Any other value of num_ranks / NUM_MAX_NVL_PEERS now reaches the
#   default branch: EP_HOST_ASSERT(false and "Unsupported RDMA ranks").
#
# NOTE(review): in this copy the whole patch sits on a single line; the original line breaks
# (including any blank context lines counted by the @@ hunk headers) presumably need to be
# restored before the patch can be applied - verify against the original patch.
diff --git a/csrc/CMakeLists.txt b/csrc/CMakeLists.txt index 005607a..3f51c27 100644 --- a/csrc/CMakeLists.txt +++ b/csrc/CMakeLists.txt @@ -6,6 +6,7 @@ set(CMAKE_VERBOSE_MAKEFILE ON) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 -fPIC") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fPIC") set(CUDA_SEPARABLE_COMPILATION ON) +list(APPEND CUDA_NVCC_FLAGS "-DENABLE_FAST_DEBUG") list(APPEND CUDA_NVCC_FLAGS "-O3") list(APPEND CUDA_NVCC_FLAGS "--ptxas-options=--verbose,--register-usage-level=10,--warn-on-local-memory-usage") diff --git a/csrc/kernels/configs.cuh b/csrc/kernels/configs.cuh index 8893b79..12a0c26 100644 --- a/csrc/kernels/configs.cuh +++ b/csrc/kernels/configs.cuh @@ -7,9 +7,14 @@ #define NUM_BUFFER_ALIGNMENT_BYTES 128 #define FINISHED_SUM_TAG 1024 +#define NUM_WAIT_NANOSECONDS 500 +#ifndef ENABLE_FAST_DEBUG #define NUM_CPU_TIMEOUT_SECS 100 #define NUM_TIMEOUT_CYCLES 200000000000ull // 200G cycles ~= 100s -#define NUM_WAIT_NANOSECONDS 500 +#else +#define NUM_CPU_TIMEOUT_SECS 10 +#define NUM_TIMEOUT_CYCLES 20000000000ull // 20G cycles ~= 10s +#endif #define LOW_LATENCY_SEND_PHASE 1 #define LOW_LATENCY_RECV_PHASE 2 diff --git a/csrc/kernels/launch.cuh b/csrc/kernels/launch.cuh index 92f3295..7fd2815 100644 --- a/csrc/kernels/launch.cuh +++ b/csrc/kernels/launch.cuh @@ -58,12 +58,9 @@ cfg.dynamicSmemBytes = smem_size; #define SWITCH_RDMA_RANKS(case_macro) \ switch (num_ranks / NUM_MAX_NVL_PEERS) { \ case 2: case_macro(2); \ - case 3: case_macro(3); \ case 4: case_macro(4); \ case 8: case_macro(8); \ case 16: case_macro(16); \ - case 18: case_macro(18); \ - case 20: case_macro(20); \ default: EP_HOST_ASSERT(false and "Unsupported RDMA ranks"); \ } while (false)