diff --git a/csrc/kernels/utils.cuh b/csrc/kernels/utils.cuh index 98296dc..5c4f2ba 100644 --- a/csrc/kernels/utils.cuh +++ b/csrc/kernels/utils.cuh @@ -309,7 +309,7 @@ __device__ __forceinline__ void tma_load_1d(const void* smem_ptr, const void* gm auto mbar_int_ptr = static_cast(__cvta_generic_to_shared(mbar_ptr)); auto smem_int_ptr = static_cast(__cvta_generic_to_shared(smem_ptr)); const auto cache_hint = evict_first ? kEvictFirst : kEvictNormal; - asm volatile("cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes.L2::cache_hint [%0], [%1], %2, [%3], %4;\n" + asm volatile("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint [%0], [%1], %2, [%3], %4;\n" :: "r"(smem_int_ptr), "l"(gmem_ptr), "r"(num_bytes), "r"(mbar_int_ptr), "l"(cache_hint) : "memory"); }