Performance optimization for compute-bound cases

This commit is contained in:
Shengyu Liu
2025-04-21 17:22:59 +08:00
parent 063ffa8ec1
commit 287061ec34
20 changed files with 1799 additions and 1217 deletions

View File

@@ -55,7 +55,6 @@ def flash_mla_with_kvcache(
out, softmax_lse = flash_mla_cuda.fwd_kvcache_mla(
q,
k_cache,
None,
head_dim_v,
cache_seqlens,
block_table,