Update some comments and docs

This commit is contained in:
Chenggang Zhao
2025-02-27 10:27:22 +08:00
parent 3885404ffb
commit 77bb07aa20
3 changed files with 12 additions and 8 deletions

View File

@@ -378,7 +378,7 @@ combine(void* combined_x,
atomic_add_release_global(atomic_clean_flag, num_experts);
}
// FP8 cast and issue IBGDA sends
// Issue IBGDA sends
if (responsible_expert_idx < num_experts) {
const auto dst_rank = responsible_expert_idx / num_local_experts;
const auto local_expert_idx = responsible_expert_idx % num_local_experts;

View File

@@ -148,9 +148,7 @@ __device__ __forceinline__ int64_t ld_volatile_global(const uint64_t *ptr) {
#define LD_NC_FUNC "ld.volatile.global"
#endif
// `ld.global.nc.L1::no_allocate` will be translated into `LDG.E.NA.[width].CONSTANT` in SASS,
// which does not have cache allocation, and `CONSTANT` memory does not have coherence control,
// so we have to control them by queue semantics
// `ld.global.nc.L1::no_allocate` will be translated into `LDG.E.NA.[width].CONSTANT` in SASS
template <typename dtype_t>
__device__ __forceinline__ dtype_t ld_nc_global(const dtype_t *ptr) {
auto ret = ld_nc_global(reinterpret_cast<const typename VecInt<sizeof(dtype_t)>::vec_t*>(ptr));
@@ -234,8 +232,7 @@ __device__ __forceinline__ void st_na_release(const uint64_t *ptr, uint64_t val)
asm volatile("st.release.gpu.global.L1::no_allocate.b64 [%0], %1;" : : "l"(ptr), "l"(val));
}
// `st.global.L1::no_allocate` will be translated into `ST.E.NA.[width]` in SASS,
// which does not allocate in cache (certainly not in L1, and presumably not in L2 either)
// `st.global.L1::no_allocate` will be translated into `ST.E.NA.[width]` in SASS
#ifndef DISABLE_AGGRESSIVE_PTX_INSTRS
#define ST_NA_FUNC "st.global.L1::no_allocate"
#else