Performance: BlockTile 256x128 optimizations enable 1500+ TFLOPS of FP8 throughput on the H800-SXM platform

This commit is contained in:
sazc
2025-04-08 17:42:23 +08:00
parent b4ecf9c3ff
commit 97575bf1c6
3 changed files with 168 additions and 12 deletions

View File

@@ -74,10 +74,16 @@ def get_best_configs(m: int, n: int, k: int, num_groups: int, num_sms: int,
Tuple[int, int, int, int, Tuple[int, bool], int]:
if not is_grouped_contiguous:
# TODO: for some cases, smaller M block is better, add them into tuning space
block_ms = (64 if m <= 64 else 128, )
# block_ms = (64 if m <= 64 else 128, )
if m <= 64:
block_ms = (64, )
elif m <= 128:
block_ms = (64, 128, )
else:
block_ms = (64, 128, 256, )
else:
block_ms = (get_m_alignment_for_contiguous_layout(), )
block_ns = tuple(range(16, 129, 8)) + (144, 160, )
block_ns = tuple(range(16, 129, 8))
fix_wave_saturate = lambda x: num_sms if x == 0 else x
get_num_waves = lambda bm, bn: (ceil_div(ceil_div(m, bm) * ceil_div(n, bn) * num_groups, num_sms) if bm else None)