Correctly flush L2, as reconstructing the tensors on every iteration effectively put them in the L2, and gave the GPU enough idle time to avoid thermal throttling in a potentially unrealistic way.

The previous behaviour is potentially representative of some use cases (e.g. previous kernel filling L2 with the data in a very specific way) but not standard benchmarking practice.
2025-06-26 23:15:49 +00:00 · 2025-03-15 20:46:24 +00:00
parent e1c070fbef
commit 6cbff5778f
2 changed files with 35 additions and 11 deletions
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -71,10 +71,11 @@ def test_gemm() -> None:
            diff = calc_diff(out, ref_out)
            assert diff < 0.001, f'{m=}, {k=}, {n=}, {diff:.5f}'

+            # Construct new tensors only once to avoid L2 cache acceleration (creating them puts them in L2)
+            x_fp8, y_fp8, out, ref_out = construct(m, k, n)
+
            # noinspection PyShadowingNames
            def test_func():
-                # Construct new tensors every time to avoid L2 cache acceleration
-                x_fp8, y_fp8, out, ref_out = construct(m, k, n)
                deep_gemm.gemm_fp8_fp8_bf16_nt(x_fp8, y_fp8, out)

            t = bench_kineto(test_func, 'fp8_gemm', suppress_kineto_output=True)
@@ -96,12 +97,13 @@ def test_m_grouped_gemm_contiguous() -> None:
        diff = calc_diff(out, ref_out)
        assert diff < 0.001, f'm={m * num_groups}, {k=}, {n=}, {diff:.5f}'

+        # Construct new tensors only once to avoid L2 cache acceleration (creating them puts them in L2)
+        x_fp8, y_fp8, out, ref_out = construct_grouped(num_groups, m, k, n, is_masked=False)
+        m_indices = torch.arange(0, num_groups, device='cuda', dtype=torch.int)
+        m_indices = m_indices.unsqueeze(-1).expand(num_groups, m).contiguous().view(-1)
+
        # noinspection PyShadowingNames
        def test_func():
-            # Construct new tensors every time to avoid L2 cache acceleration
-            x_fp8, y_fp8, out, ref_out = construct_grouped(num_groups, m, k, n, is_masked=False)
-            m_indices = torch.arange(0, num_groups, device='cuda', dtype=torch.int)
-            m_indices = m_indices.unsqueeze(-1).expand(num_groups, m).contiguous().view(-1)
            deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(x_fp8, y_fp8, out, m_indices)

        t = bench_kineto(test_func, 'fp8_gemm', suppress_kineto_output=True)
@@ -129,11 +131,12 @@ def test_m_grouped_gemm_masked() -> None:
                    diff = calc_diff(out[j, :masked_m[j].item()], ref_out[j, :masked_m[j].item()])
                    assert diff < 0.001, f'{m=}, {k=}, {n=}, {j=}, masked_m={masked_m[j]}, {num_groups=}, {diff:.5f}'

+            # Construct new tensors only once to avoid L2 cache acceleration (creating them puts them in L2)
+            x_fp8, y_fp8, out, ref_out = construct_grouped(num_groups, m, k, n, is_masked=True)
+            masked_m = torch.ones((num_groups, ), device='cuda', dtype=torch.int) * m
+
            # noinspection PyShadowingNames
            def test_func():
-                # Construct new tensors every time to avoid L2 cache acceleration
-                x_fp8, y_fp8, out, ref_out = construct_grouped(num_groups, m, k, n, is_masked=True)
-                masked_m = torch.ones((num_groups, ), device='cuda', dtype=torch.int) * m
                deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_masked(x_fp8, y_fp8, out, masked_m, m)

            # Test performance with fixed shapes