mirror of
https://github.com/deepseek-ai/DeepEP
synced 2025-06-26 18:28:11 +00:00
* Fully remove FIFO slots
* Fully remove FIFO buffers
* Minor style fixes
* Fix some typos
* Bugs fixed
* Cleanup `ibgda_poll_cq`
48 lines
1.3 KiB
Plaintext
48 lines
1.3 KiB
Plaintext
#pragma once

// Compile-time size limits and buffer parameters.
// NOTE(review): the per-macro notes are read off the macro names only; the code
// that consumes these values is not part of this header — confirm against users.
#define NUM_MAX_NVL_PEERS 8                     // presumably the cap on NVLink peers
#define NUM_MAX_RDMA_PEERS 20                   // presumably the cap on RDMA peers
#define NUM_WORKSPACE_BYTES (32 * 1024 * 1024)  // 32 MiB; parenthesised so it expands safely inside expressions
#define NUM_MAX_LOCAL_EXPERTS 1024
#define NUM_BUFFER_ALIGNMENT_BYTES 128

// Tag value and timeout / polling parameters.
// NOTE(review): how FINISHED_SUM_TAG and NUM_WAIT_NANOSECONDS are consumed is
// not visible in this header — confirm against their users.
#define FINISHED_SUM_TAG 1024
// Both timeouts below target the same 100 s budget: one expressed in seconds,
// the other in cycles.
#define NUM_CPU_TIMEOUT_SECS 100
// The `ull` suffix is required: 200000000000 does not fit in 32 bits.
#define NUM_TIMEOUT_CYCLES 200000000000ull  // 200G cycles ~= 100s
#define NUM_WAIT_NANOSECONDS 500

// Low-latency phase selectors. The two values occupy distinct bits (0x1 and
// 0x2), so they do not collide if OR-ed together.
// NOTE(review): whether callers actually combine them as a bitmask is not
// visible in this header — confirm against the users.
#define LOW_LATENCY_SEND_PHASE 1
#define LOW_LATENCY_RECV_PHASE 2

// Make CLion CUDA indexing work.
// Only active when __CLION_IDE__ is defined; it pre-defines the two macros
// below so that code guarded by them is visible to the indexer.
#ifdef __CLION_IDE__
// 900 is the NVCC encoding (major * 100 + minor * 10) of compute capability 9.0.
#define __CUDA_ARCH__ 900  // NOLINT(*-reserved-identifier)
// NVCC defines this macro itself when building relocatable device code.
#define __CUDACC_RDC__     // NOLINT(*-reserved-identifier)
#endif

// Remove Torch restrictions.
// Clears the __CUDA_NO_* opt-out macros so they are undefined by the time the
// CUDA half / bfloat16 headers are included further down in this file; this
// must therefore stay above those #include lines.
// NOTE(review): presumably these macros are injected by the PyTorch extension
// build flags — confirm against the build configuration.
// `#undef` on a name that is not currently defined is a no-op, so no
// `#ifdef` guard is needed around each one.
#undef __CUDA_NO_HALF_CONVERSIONS__
#undef __CUDA_NO_HALF_OPERATORS__
#undef __CUDA_NO_HALF2_OPERATORS__
#undef __CUDA_NO_BFLOAT16_CONVERSIONS__
#undef __CUDA_NO_BFLOAT162_OPERATORS__

#include <cuda_bf16.h>
|
|
#include <cuda_fp8.h>
|
|
#include <cuda_runtime.h>
|
|
#include <nvshmem.h>
|
|
#include <nvshmemx.h>
|
|
#include <infiniband/mlx5dv.h>
|
|
#include <non_abi/device/threadgroup/nvshmemi_common_device_defines.cuh>
|
|
#include <device_host_transport/nvshmem_common_ibgda.h>
|