mirror of
https://github.com/deepseek-ai/DeepEP
synced 2025-05-05 20:44:48 +00:00
51 lines
1.5 KiB
Plaintext
51 lines
1.5 KiB
Plaintext
#pragma once
|
|
|
|
// Upper bound on NVL peers.
// NOTE(review): "NVL" presumably refers to NVLink-connected GPUs within one node — confirm against the users of this macro.
#define NUM_MAX_NVL_PEERS 8
|
|
// Upper bound on RDMA peers.
// NOTE(review): presumably the maximum number of RDMA-reachable ranks/nodes — confirm against the users of this macro.
#define NUM_MAX_RDMA_PEERS 20
|
|
// Upper bound on FIFO slots (32768 = 2^15).
// NOTE(review): which FIFO this sizes is not visible from this header — confirm.
#define NUM_MAX_FIFO_SLOTS 32768
|
|
// Workspace size in bytes: 32 MiB (33,554,432).
// The product is parenthesized so it expands safely inside larger expressions.
#define NUM_WORKSPACE_BYTES (32 * 1024 * 1024)
|
|
// Upper bound on local experts.
// NOTE(review): presumably the number of experts hosted by a single rank — confirm.
#define NUM_MAX_LOCAL_EXPERTS 1024
|
|
// Buffer alignment, in bytes.
// NOTE(review): presumably applied to buffer offsets/sizes by the allocation code — confirm.
#define NUM_BUFFER_ALIGNMENT_BYTES 128
|
|
|
|
// Tag constant with value 1024.
// NOTE(review): the name suggests it marks a "finished" state in a summed counter; the protocol that uses it is defined elsewhere — confirm.
#define FINISHED_SUM_TAG 1024
|
|
// Timeout in seconds.
// NOTE(review): the name suggests it bounds host-side (CPU) waits — confirm against the users of this macro.
#define NUM_CPU_TIMEOUT_SECS 100
|
|
// Timeout expressed in clock cycles, as an `unsigned long long` literal (the value does not fit in 32 bits).
// The "~= 100s" estimate corresponds to a clock of roughly 2 GHz (200e9 cycles / 100 s);
// the actual wall-clock duration scales with the real clock rate.
#define NUM_TIMEOUT_CYCLES 200000000000ull // 200G cycles ~= 100s
|
|
// Wait interval in nanoseconds.
// NOTE(review): presumably the sleep/back-off duration used inside polling loops — confirm.
#define NUM_WAIT_NANOSECONDS 500
|
|
|
|
// Low-latency send phase identifier (value 1).
// NOTE(review): this and LOW_LATENCY_RECV_PHASE (2) occupy distinct bits, so they look intended to be combinable with bitwise OR — confirm.
#define LOW_LATENCY_SEND_PHASE 1
|
|
// Low-latency receive phase identifier (value 2).
// NOTE(review): this and LOW_LATENCY_SEND_PHASE (1) occupy distinct bits, so they look intended to be combinable with bitwise OR — confirm.
#define LOW_LATENCY_RECV_PHASE 2
|
|
|
|
// Make CLion CUDA indexing work.
// Everything in this section is compiled only when __CLION_IDE__ is defined.
#if defined(__CLION_IDE__)
#define __CUDA_ARCH__ 900 // NOLINT(*-reserved-identifier)
#define __CUDACC_RDC__ // NOLINT(*-reserved-identifier)

// Variadic stand-in that `printf` is redirected to below. The arguments are
// ignored; the body is a single `trap` instruction.
__host__ __device__ __forceinline__
void host_device_printf(const char* format, ...) {
    asm volatile("trap;");
}

#define printf host_device_printf
#endif  // defined(__CLION_IDE__)
|
|
|
|
// Remove Torch restrictions.
// Clear the `__CUDA_NO_*` half / bfloat16 macros before the CUDA headers further
// down this file are included. `#undef` on a name that is not currently defined
// as a macro is ignored, so no `#ifdef` guard is required around each one.
#undef __CUDA_NO_HALF_CONVERSIONS__
#undef __CUDA_NO_HALF_OPERATORS__
#undef __CUDA_NO_HALF2_OPERATORS__
#undef __CUDA_NO_BFLOAT16_CONVERSIONS__
#undef __CUDA_NO_BFLOAT162_OPERATORS__
|
|
|
|
#include <cuda_bf16.h>
|
|
#include <cuda_fp8.h>
|
|
#include <cuda_runtime.h>
|
|
#include <nvshmem.h>
|
|
#include <nvshmemx.h>
|
|
#include <infiniband/mlx5dv.h>
|
|
#include <non_abi/device/threadgroup/nvshmemi_common_device_defines.cuh>
|
|
#include <device_host_transport/nvshmem_common_ibgda.h>
|