Add CSV benchmark results saving feature

This commit is contained in:
Derek Rosenzweig 2025-03-13 20:02:24 -07:00
parent bd2a775528
commit eb8e8346c8
4 changed files with 65 additions and 7 deletions

3
.gitignore vendored
View File

@ -9,3 +9,6 @@ dist
# Third-party links created by `setup.py develop`
deep_gemm/include/cute
deep_gemm/include/cutlass
/documentation

24
Dockerfile Normal file
View File

@ -0,0 +1,24 @@
# Base image: CUDA 12.3.1 "devel" flavour on Ubuntu 22.04.
FROM nvidia/cuda:12.3.1-devel-ubuntu22.04

# Install the system packages needed to build and run the project, and drop the
# apt package lists in the same layer so they do not end up in the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        python3 python3-pip git build-essential && \
    rm -rf /var/lib/apt/lists/*

# Install Python dependencies BEFORE copying the source tree: these layers are
# large and only need rebuilding when the dependency list changes, not on every
# source edit. --no-cache-dir keeps pip's download cache out of the image.
RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools
RUN python3 -m pip install --no-cache-dir torch torchvision torchaudio \
        --index-url https://download.pytorch.org/whl/cu121

# Set working directory
WORKDIR /workspace

# Copy DeepGEMM into the container and install it in editable mode.
COPY . /workspace/DeepGEMM
RUN python3 -m pip install --no-cache-dir -e /workspace/DeepGEMM

# Environment variables read by the NVIDIA container runtime
# (key=value form; the space-separated ENV form is legacy syntax).
ENV NVIDIA_VISIBLE_DEVICES=all
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility

# Default command: drop into an interactive shell.
CMD ["/bin/bash"]

2
scripts/docker-build.sh Executable file
View File

@ -0,0 +1,2 @@
#!/bin/bash
# Build the deepgemm-test:latest image from the Dockerfile at the repository root.
#
# Works from any working directory: the build context and Dockerfile are
# resolved relative to this script's location (scripts/), not the caller's cwd.

# Abort on the first failing command, on unset variables, and on pipe failures.
set -euo pipefail

# Repository root is the parent directory of the directory holding this script.
repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

docker build -t deepgemm-test:latest -f "${repo_root}/Dockerfile" "${repo_root}"

View File

@ -1,6 +1,9 @@
import random
import torch
from typing import Tuple
import csv
import random
import deep_gemm
from deep_gemm import bench_kineto, calc_diff, ceil_div, get_col_major_tma_aligned_tensor
@ -143,16 +146,42 @@ def test_m_grouped_gemm_masked() -> None:
f'{(num_groups * (m * k + k * n + m * n * 2)) / 1e9 / t:4.0f} GB/s')
print()
# Script entry point: configure TF32 and RNG seeds, then report the library path.
# NOTE(review): this hunk is rendered without +/- markers or indentation, so
# removed and added lines are interleaved. The two `if __name__` guards below
# look like the old (single-quoted) and new (double-quoted) versions of the
# same line — confirm against the raw diff before relying on this listing.
if __name__ == '__main__':
torch.backends.cuda.matmul.allow_tf32 = True
if __name__ == "__main__":
# Enable TF32 for both the cuDNN and the CUDA matmul backends.
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True
# Seed torch's and Python's random generators with the fixed value 0.
torch.manual_seed(0)
random.seed(0)
# Show the path(s) the deep_gemm package was imported from.
print('Library path:')
print(f' > {deep_gemm.__path__}\n')
# NOTE(review): these three calls may be lines this commit removes in favour of
# the inline benchmark loop that follows — confirm against the raw diff.
test_gemm()
test_m_grouped_gemm_contiguous()
test_m_grouped_gemm_masked()
# Benchmark the plain FP8 GEMM over a fixed grid of (m, k, n) shapes, printing
# one line per shape and collecting one CSV row per shape.
rows = []
print('Testing GEMM:')
kn_shapes = [(7168, 2112), (1536, 24576), (512, 32768), (16384, 7168), (7168, 4096), (2048, 7168)]
for m in (64, 128, 4096):
    for k, n in kn_shapes:
        # Correctness gate: run once and compare against the reference output
        # before spending time on the benchmark.
        x_fp8, y_fp8, out, ref_out = construct(m, k, n)
        deep_gemm.gemm_fp8_fp8_bf16_nt(x_fp8, y_fp8, out)
        diff = calc_diff(out, ref_out)
        assert diff < 0.001, f'{m=}, {k=}, {n=}, {diff:.5f}'

        def test_func():
            # Builds fresh operands on every call — presumably so each timed
            # run starts from new tensors; confirm against bench_kineto's usage.
            lhs, rhs, dst, _ = construct(m, k, n)
            deep_gemm.gemm_fp8_fp8_bf16_nt(lhs, rhs, dst)

        elapsed = bench_kineto(test_func, 'fp8_gemm', suppress_kineto_output=True)

        # Derived metrics. `elapsed` is treated as seconds (it is scaled by 1e6
        # to get microseconds). The byte count weights the m*n output by 2 —
        # presumably 1-byte FP8 inputs and a 2-byte BF16 output; TODO confirm.
        time_us = elapsed * 1e6
        tflops = 2 * m * n * k / elapsed / 1e12
        bandwidth = (m * k + k * n + m * n * 2) / elapsed / 1e9
        print(f' > Performance (m={m:5}, n={n:5}, k={k:5}): {time_us:.2f} us | throughput: {tflops:.2f} TFLOPS, {bandwidth:.2f} GB/s')
        rows.append([m, n, k, round(time_us, 2), round(tflops, 2), round(bandwidth, 2)])

# Write everything in one go after all shapes have been measured:
# a header row followed by one row per benchmarked shape.
header = ['M', 'N', 'K', 'Latency_us', 'TFLOPS', 'Bandwidth_GBps']
with open('results.csv', 'w', newline='') as out_file:
    csv.writer(out_file).writerows([header, *rows])
print("Benchmarks saved to results.csv")