Use TMA instead of LD/ST for intra-node normal kernels (#191)

* Update CMake files

* Use TMA instead of LD/ST for intranode dispatch

* Use TMA instead of LD/ST for intranode combine

* Adjust configs

* Test default configs as well

* More warps for combine

* Add inter-thread fence

* Enable more warps

* Do not use TMA for senders

* Update configs

* Remove useless wait
This commit is contained in:
Chenggang Zhao
2025-06-06 15:40:17 +08:00
committed by GitHub
parent df4debe30c
commit c8dceba110
6 changed files with 230 additions and 87 deletions

View File

@@ -171,8 +171,8 @@ class Buffer:
"""
config_map = {
-2: Config(Buffer.num_sms, 16, 256, 6, 128),
-4: Config(Buffer.num_sms, 16, 256, 6, 128),
+2: Config(Buffer.num_sms, 24, 256, 6, 128),
+4: Config(Buffer.num_sms, 6, 256, 6, 128),
8: Config(Buffer.num_sms, 6, 256, 6, 128),
16: Config(Buffer.num_sms, 16, 288, 20, 128),
24: Config(Buffer.num_sms, 8, 288, 32, 128),
@@ -198,9 +198,9 @@ class Buffer:
"""
config_map = {
-2: Config(Buffer.num_sms, 6, 256, 6, 128),
-4: Config(Buffer.num_sms, 6, 256, 6, 128),
-8: Config(Buffer.num_sms, 6, 256, 6, 128),
+2: Config(Buffer.num_sms, 10, 256, 6, 128),
+4: Config(Buffer.num_sms, 9, 256, 6, 128),
+8: Config(Buffer.num_sms, 4, 256, 6, 128),
16: Config(Buffer.num_sms, 2, 288, 28, 128),
24: Config(Buffer.num_sms, 1, 288, 20, 128),
32: Config(Buffer.num_sms, 1, 288, 20, 128),