mirror of
https://github.com/deepseek-ai/DeepEP
synced 2025-06-26 18:28:11 +00:00
Support Ampere architecture (#204)
* Update README
* Update `setup.py`
* Fix headers
* Add `DISABLE_NVSHMEM` for APIs
* Fix launch
* Fix TMA settings
* Fix TMA usages
* Fix dlink
* Separate layout kernels
* Update version
* Add `is_sm90_compiled`
* Fix tests
* Add NVLink connection checks
* Update README
* Fix tests
* Add some comments
* Minor fix
* Minor fix
* Fix bugs
This commit is contained in:
@@ -28,6 +28,17 @@ void finalize();
|
||||
|
||||
} // namespace internode
|
||||
|
||||
// Layout kernels
|
||||
namespace layout {
|
||||
|
||||
void get_dispatch_layout(const int64_t* topk_idx,
|
||||
int* num_tokens_per_rank, int* num_tokens_per_rdma_rank,
|
||||
int* num_tokens_per_expert, bool* is_token_in_rank,
|
||||
int num_tokens, int num_topk, int num_ranks, int num_experts,
|
||||
cudaStream_t stream);
|
||||
|
||||
} // namespace layout
|
||||
|
||||
// Intranode kernels
|
||||
namespace intranode {
|
||||
|
||||
@@ -69,12 +80,6 @@ namespace internode {
|
||||
|
||||
int get_source_meta_bytes();
|
||||
|
||||
void get_dispatch_layout(const int64_t* topk_idx,
|
||||
int* num_tokens_per_rank, int* num_tokens_per_rdma_rank,
|
||||
int* num_tokens_per_expert, bool* is_token_in_rank,
|
||||
int num_tokens, int num_topk, int num_ranks, int num_experts,
|
||||
cudaStream_t stream);
|
||||
|
||||
void notify_dispatch(const int* num_tokens_per_rank, int* moe_recv_counter_mapped, int num_ranks,
|
||||
const int* num_tokens_per_rdma_rank, int* moe_recv_rdma_counter_mapped,
|
||||
const int* num_tokens_per_expert, int* moe_recv_expert_counter_mapped, int num_experts,
|
||||
|
||||
Reference in New Issue
Block a user