#pragma once #include "params.h" template void run_flash_mla_combine_kernel(Flash_fwd_mla_params ¶ms, cudaStream_t stream);