#pragma once #define NUM_MAX_NVL_PEERS 8 #define NUM_MAX_RDMA_PEERS 20 #define NUM_WORKSPACE_BYTES (32 * 1024 * 1024) #define NUM_MAX_LOCAL_EXPERTS 1024 #define NUM_BUFFER_ALIGNMENT_BYTES 128 #define FINISHED_SUM_TAG 1024 #define NUM_CPU_TIMEOUT_SECS 100 #define NUM_TIMEOUT_CYCLES 200000000000ull // 200G cycles ~= 100s #define NUM_WAIT_NANOSECONDS 500 #define LOW_LATENCY_SEND_PHASE 1 #define LOW_LATENCY_RECV_PHASE 2 // Make CLion CUDA indexing work #ifdef __CLION_IDE__ #define __CUDA_ARCH__ 900 // NOLINT(*-reserved-identifier) #define __CUDACC_RDC__ // NOLINT(*-reserved-identifier) #endif // Remove Torch restrictions #ifdef __CUDA_NO_HALF_CONVERSIONS__ #undef __CUDA_NO_HALF_CONVERSIONS__ #endif #ifdef __CUDA_NO_HALF_OPERATORS__ #undef __CUDA_NO_HALF_OPERATORS__ #endif #ifdef __CUDA_NO_HALF2_OPERATORS__ #undef __CUDA_NO_HALF2_OPERATORS__ #endif #ifdef __CUDA_NO_BFLOAT16_CONVERSIONS__ #undef __CUDA_NO_BFLOAT16_CONVERSIONS__ #endif #ifdef __CUDA_NO_BFLOAT162_OPERATORS__ #undef __CUDA_NO_BFLOAT162_OPERATORS__ #endif #include #include #include #ifndef DISABLE_SM90_FEATURES #include #else // Ampere does not support FP8 features #define __NV_E4M3 0 #define __NV_E5M2 1 typedef int __nv_fp8_interpretation_t; typedef int __nv_fp8x4_e4m3; typedef uint8_t __nv_fp8_storage_t; #endif #ifndef DISABLE_NVSHMEM #include #include #include #include #include #endif