#pragma once

// Compile-time configuration constants shared through this header.
// Each directive must stay on its own line: preprocessor directives end at the
// newline, so joining them leaves only `#pragma once` in effect.
//
// NOTE(review): the meaning of each limit below is implied by its name only;
// confirm against the code that consumes the macro before changing a value.

// Peer-count, slot-count and buffer-size limits.
#define NUM_MAX_NVL_PEERS 8
#define NUM_MAX_RDMA_PEERS 20
#define NUM_MAX_FIFO_SLOTS 32768
#define NUM_WORKSPACE_BYTES (32 * 1024 * 1024)  // 32 MiB
#define NUM_MAX_LOCAL_EXPERTS 1024
#define NUM_BUFFER_ALIGNMENT_BYTES 128

// Tag value and timeout / wait settings.
#define FINISHED_SUM_TAG 1024
#define NUM_CPU_TIMEOUT_SECS 100
#define NUM_TIMEOUT_CYCLES 200000000000ull  // 200G cycles ~= 100s
#define NUM_WAIT_NANOSECONDS 500

// Phase identifiers for the low-latency path.
#define LOW_LATENCY_SEND_PHASE 1
#define LOW_LATENCY_RECV_PHASE 2

// Make CLion CUDA indexing work.
// This section is compiled only when __CLION_IDE__ is defined. It defines
// __CUDA_ARCH__ and __CUDACC_RDC__, and redirects `printf` to a stub whose
// body is a single inline-asm `trap;`.
#ifdef __CLION_IDE__
#define __CUDA_ARCH__ 900  // NOLINT(*-reserved-identifier)
#define __CUDACC_RDC__     // NOLINT(*-reserved-identifier)
__host__ __device__ __forceinline__ void host_device_printf(const char* format, ...) {
    asm volatile("trap;");
}
#define printf host_device_printf
#endif

// Remove Torch restrictions.
// Each guard undefines its __CUDA_NO_* macro only when it is already defined,
// so the header is safe to include whether or not the build sets them.
#ifdef __CUDA_NO_HALF_CONVERSIONS__
#undef __CUDA_NO_HALF_CONVERSIONS__
#endif
#ifdef __CUDA_NO_HALF_OPERATORS__
#undef __CUDA_NO_HALF_OPERATORS__
#endif
#ifdef __CUDA_NO_HALF2_OPERATORS__
#undef __CUDA_NO_HALF2_OPERATORS__
#endif
#ifdef __CUDA_NO_BFLOAT16_CONVERSIONS__
#undef __CUDA_NO_BFLOAT16_CONVERSIONS__
#endif
#ifdef __CUDA_NO_BFLOAT162_OPERATORS__
#undef __CUDA_NO_BFLOAT162_OPERATORS__
#endif

// NOTE(review): the header originally ended with eight `#include` directives
// at this point, after the #undef guards above. Their operands (the header
// names) are missing from the text under review, and a bare `#include` is
// ill-formed, so they are not reproduced here. Restore the original include
// list at this position from version control.