mirror of
https://github.com/deepseek-ai/DeepEP
synced 2025-06-26 18:28:11 +00:00
Support Ampere architecture (#204)
* Update README * Update `setup.py` * Fix headers * Add `DISABLE_NVSHMEM` for APIs * Fix launch * Fix TMA settings * Fix TMA usages * Fix dlink * Separate layout kernels * Update version * Add `is_sm90_compiled` * Fix tests * Add NVLink connection checks * Update README * Fix tests * Add some comments * Minor fix * Minor fix * Fix bugs
This commit is contained in:
@@ -266,6 +266,9 @@ __device__ __forceinline__ void st_na_global(const int4 *ptr, const int4& value
|
||||
::"l"(ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w));
|
||||
}
|
||||
|
||||
// TMA PTX instructions
|
||||
#ifndef DISABLE_SM90_FEATURES
|
||||
|
||||
// Issues the PTX instruction `fence.proxy.async.shared::cta`.
//
// Per the PTX ISA, `fence.proxy.async` orders memory accesses made through the
// generic proxy against accesses made through the async proxy; the
// `.shared::cta` qualifier restricts the fence to the CTA's shared memory.
// NOTE(review): presumably called after ordinary shared-memory writes and
// before a TMA (bulk async) store reads that buffer — confirm against callers.
//
// Takes no arguments and returns nothing. Compiled only when
// `DISABLE_SM90_FEATURES` is not defined.
__device__ __forceinline__ void fence_view_async_shared() {
    // The "memory" clobber tells the compiler this statement touches memory, so
    // it may not move surrounding loads/stores across the fence. `volatile`
    // alone only prevents the asm from being deleted or moved relative to
    // other volatile asm statements. This matches `tma_store_wait`, which
    // already declares the same clobber.
    asm volatile("fence.proxy.async.shared::cta; \n" ::: "memory");
}
|
||||
@@ -327,6 +330,8 @@ __device__ __forceinline__ void tma_store_wait() {
|
||||
asm volatile("cp.async.bulk.wait_group.read %0;" :: "n"(N) : "memory");
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
template <typename dtype_t>
|
||||
__host__ __device__ dtype_t cell_div(dtype_t a, dtype_t b) {
|
||||
return (a + b - 1) / b;
|
||||
|
||||
Reference in New Issue
Block a user