// Mirror of https://github.com/deepseek-ai/DeepEP (synced 2025-06-26 18:28:11 +00:00).
// Latest commit summary: Update README; Update `setup.py`; Fix headers; Add `DISABLE_NVSHMEM` for APIs;
// Fix launch; Fix TMA settings; Fix TMA usages; Fix dlink; Separate layout kernels; Update version;
// Add `is_sm90_compiled`; Fix tests; Add NVLink connection checks; Update README; Fix tests;
// Add some comments; Minor fix; Minor fix; Fix bugs.
// Listing info: 62 lines, 1.5 KiB, shown as Plaintext.
#pragma once

// Compile-time limits and sizes.
// NOTE(review): the short descriptions below are read off the macro names only;
// the code that consumes these values is outside this header — confirm against
// the call sites before relying on them.
#define NUM_MAX_NVL_PEERS 8                     // presumably the cap on NVLink peers
#define NUM_MAX_RDMA_PEERS 20                   // presumably the cap on RDMA peers
#define NUM_WORKSPACE_BYTES (32 * 1024 * 1024)  // 32 MiB
#define NUM_MAX_LOCAL_EXPERTS 1024              // presumably the cap on experts per rank
#define NUM_BUFFER_ALIGNMENT_BYTES 128          // byte alignment, per the name

// Tag value and timeout / wait settings.
// NOTE(review): how each value is consumed is not visible in this header — confirm
// at the call sites. The cycle budget's own inline note equates it to roughly
// 100 s, the same figure as NUM_CPU_TIMEOUT_SECS.
#define FINISHED_SUM_TAG 1024
#define NUM_CPU_TIMEOUT_SECS 100
#define NUM_TIMEOUT_CYCLES 200000000000ull // 200G cycles ~= 100s
#define NUM_WAIT_NANOSECONDS 500

// Identifiers for the two low-latency phases.
// NOTE(review): the values 1 and 2 occupy distinct bits, so they look like flags
// that may be OR-ed together — confirm against the code that tests them.
#define LOW_LATENCY_SEND_PHASE 1
#define LOW_LATENCY_RECV_PHASE 2

// Make CLion CUDA indexing work
// Only active when the IDE's own marker macro is defined; regular builds skip
// this block entirely. Inside the IDE it supplies the two macros below
// (architecture value 900, and an empty __CUDACC_RDC__).
#ifdef __CLION_IDE__
#define __CUDA_ARCH__ 900 // NOLINT(*-reserved-identifier)
#define __CUDACC_RDC__ // NOLINT(*-reserved-identifier)
#endif

// Remove Torch restrictions
// Each __CUDA_NO_* macro listed here is undefined if (and only if) it is
// currently defined, so the CUDA half / bfloat16 headers included later in this
// file see none of them. The #ifdef guards keep the #undef directives from being
// processed at all when the macro is absent.
// NOTE(review): there is no counterpart for __CUDA_NO_BFLOAT16_OPERATORS__ in
// this list — confirm that is intentional.
#ifdef __CUDA_NO_HALF_CONVERSIONS__
#undef __CUDA_NO_HALF_CONVERSIONS__
#endif
#ifdef __CUDA_NO_HALF_OPERATORS__
#undef __CUDA_NO_HALF_OPERATORS__
#endif
#ifdef __CUDA_NO_HALF2_OPERATORS__
#undef __CUDA_NO_HALF2_OPERATORS__
#endif
#ifdef __CUDA_NO_BFLOAT16_CONVERSIONS__
#undef __CUDA_NO_BFLOAT16_CONVERSIONS__
#endif
#ifdef __CUDA_NO_BFLOAT162_OPERATORS__
#undef __CUDA_NO_BFLOAT162_OPERATORS__
#endif

// Core headers: fixed-width integers (uint8_t is used by the FP8 fallback below),
// CUDA bfloat16 support, and the CUDA runtime API.
#include <cstdint>
#include <cuda_bf16.h>
#include <cuda_runtime.h>

// FP8 support: use the real CUDA FP8 header unless SM90 features are disabled.
#ifndef DISABLE_SM90_FEATURES
#include <cuda_fp8.h>
#else
// Ampere does not support FP8 features
// Stand-in definitions carrying the same names as the FP8 header's symbols.
// They are plain integer aliases, not the real CUDA types — presumably just
// enough for code that mentions these names to keep compiling (confirm at the
// use sites).
#define __NV_E4M3 0
#define __NV_E5M2 1
typedef int __nv_fp8_interpretation_t;
typedef int __nv_fp8x4_e4m3;
typedef uint8_t __nv_fp8_storage_t;
#endif

// NVSHMEM-related headers, skipped entirely when DISABLE_NVSHMEM is defined.
// Besides the public nvshmem.h / nvshmemx.h, this pulls in the mlx5 direct-verbs
// header and two headers from NVSHMEM's internal include tree (the `non_abi`
// and `device_host_transport` paths).
// NOTE(review): the internal headers are addressed by path inside the NVSHMEM
// installation; presumably they can move between NVSHMEM releases — confirm
// when upgrading.
#ifndef DISABLE_NVSHMEM
#include <nvshmem.h>
#include <nvshmemx.h>
#include <infiniband/mlx5dv.h>
#include <non_abi/device/threadgroup/nvshmemi_common_device_defines.cuh>
#include <device_host_transport/nvshmem_common_ibgda.h>
#endif