Add automatic warp count control for low-latency kernels (#213)

* Add automatic warp count control for low-latency dispatch

* Add automatic warp count control for low-latency combine

* More assertions
This commit is contained in:
Chenggang Zhao
2025-06-16 11:56:43 +08:00
committed by GitHub
parent 4e923188f7
commit 1b92be8a71
6 changed files with 83 additions and 65 deletions

View File

@@ -6,13 +6,13 @@
namespace deep_ep {
template <typename dtype_t>
-dtype_t cell_div(dtype_t a, dtype_t b) {
+dtype_t ceil_div(dtype_t a, dtype_t b) {
return (a + b - 1) / b;
}
template <typename dtype_t>
dtype_t align(dtype_t a, dtype_t b) {
-return cell_div<dtype_t>(a, b) * b;
+return ceil_div<dtype_t>(a, b) * b;
}
struct Config {