Support Ampere architecture (#204)

* Update README
* Update `setup.py`
* Fix headers
* Add `DISABLE_NVSHMEM` for APIs
* Fix launch
* Fix TMA settings
* Fix TMA usages
* Fix dlink
* Separate layout kernels
* Update version
* Add `is_sm90_compiled`
* Fix tests
* Add NVLink connection checks
* Update README
* Fix tests
* Add some comments
* Minor fix
* Minor fix
* Fix bugs
2025-06-26 18:28:11 +00:00 · 2025-06-11 15:48:18 +08:00
parent dd13c7145c
commit b8d90fb753
16 changed files with 413 additions and 174 deletions
--- a/csrc/kernels/api.cuh
+++ b/csrc/kernels/api.cuh
@@ -28,6 +28,17 @@ void finalize();

 } // namespace internode

+// Layout kernels
+namespace layout {
+
+void get_dispatch_layout(const int64_t* topk_idx,
+                         int* num_tokens_per_rank, int* num_tokens_per_rdma_rank,
+                         int* num_tokens_per_expert, bool* is_token_in_rank,
+                         int num_tokens, int num_topk, int num_ranks, int num_experts,
+                         cudaStream_t stream);
+
+} // namespace layout
+
 // Intranode kernels
 namespace intranode {

@@ -69,12 +80,6 @@ namespace internode {

 int get_source_meta_bytes();

-void get_dispatch_layout(const int64_t* topk_idx,
-                         int* num_tokens_per_rank, int* num_tokens_per_rdma_rank,
-                         int* num_tokens_per_expert, bool* is_token_in_rank,
-                         int num_tokens, int num_topk, int num_ranks, int num_experts,
-                         cudaStream_t stream);
-
 void notify_dispatch(const int* num_tokens_per_rank, int* moe_recv_counter_mapped, int num_ranks,
                     const int* num_tokens_per_rdma_rank, int* moe_recv_rdma_counter_mapped,
                     const int* num_tokens_per_expert, int* moe_recv_expert_counter_mapped, int num_experts,