Mirror of https://github.com/deepseek-ai/DeepGEMM
Synced 2025-06-04 05:46:13 +00:00
Add CSV benchmark results saving feature
This commit is contained in:
parent bd2a775528
commit eb8e8346c8
.gitignore (vendored): 3 changed lines

@@ -9,3 +9,6 @@ dist
# Third-party links created by `setup.py develop`
deep_gemm/include/cute
deep_gemm/include/cutlass


/documentation
Dockerfile (new file): 24 lines

@@ -0,0 +1,24 @@
FROM nvidia/cuda:12.3.1-devel-ubuntu22.04

# Install required packages
RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-pip git build-essential && \
    rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /workspace

# Copy DeepGEMM into container
COPY . /workspace/DeepGEMM

# Install Python dependencies (if any, e.g., torch)
RUN python3 -m pip install --upgrade pip setuptools
RUN python3 -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
RUN python3 -m pip install -e /workspace/DeepGEMM

# Environment variables (optional)
ENV NVIDIA_VISIBLE_DEVICES all
ENV NVIDIA_DRIVER_CAPABILITIES compute,utility

# Default command
CMD ["/bin/bash"]
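Once the image is built (the helper script added below does this), a quick sanity check of the environment this Dockerfile sets up might look like the sketch that follows; the deepgemm-test:latest tag and the --gpus flag are assumptions based on this commit's files and on the NVIDIA Container Toolkit being installed on the host:

# Check that CUDA and the editable DeepGEMM install are visible inside the container
docker run --rm --gpus all deepgemm-test:latest \
    python3 -c "import torch, deep_gemm; print(torch.cuda.is_available(), deep_gemm.__path__)"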
scripts/docker-build.sh (new executable file): 2 lines

@@ -0,0 +1,2 @@
#!/bin/bash
docker build -t deepgemm-test:latest -f Dockerfile .
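A minimal sketch of the intended workflow with these two files, assuming the host has Docker and the NVIDIA Container Toolkit; the deepgemm-test:latest tag comes from the script above:

# Build the image from the repository root, then start an interactive shell with GPU access
bash scripts/docker-build.sh
docker run --rm -it --gpus all deepgemm-test:latest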
tests/test_core.py

@@ -1,6 +1,9 @@
import random
import torch
from typing import Tuple
import csv
import random



import deep_gemm
from deep_gemm import bench_kineto, calc_diff, ceil_div, get_col_major_tma_aligned_tensor
@@ -143,16 +146,42 @@ def test_m_grouped_gemm_masked() -> None:
              f'{(num_groups * (m * k + k * n + m * n * 2)) / 1e9 / t:4.0f} GB/s')
    print()


if __name__ == '__main__':
    torch.backends.cuda.matmul.allow_tf32 = True
if __name__ == "__main__":
    torch.backends.cudnn.allow_tf32 = True
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.manual_seed(0)
    random.seed(0)

    print('Library path:')
    print(f' > {deep_gemm.__path__}\n')

    test_gemm()
    test_m_grouped_gemm_contiguous()
    test_m_grouped_gemm_masked()
    results = []

    # Collect GEMM benchmarks
    print('Testing GEMM:')
    for m in (64, 128, 4096):
        for k, n in [(7168, 2112), (1536, 24576), (512, 32768), (16384, 7168), (7168, 4096), (2048, 7168)]:
            x_fp8, y_fp8, out, ref_out = construct(m, k, n)
            deep_gemm.gemm_fp8_fp8_bf16_nt(x_fp8, y_fp8, out)
            diff = calc_diff(out, ref_out)
            assert diff < 0.001, f'{m=}, {k=}, {n=}, {diff:.5f}'

            def test_func():
                x_fp8, y_fp8, out, ref_out = construct(m, k, n)
                deep_gemm.gemm_fp8_fp8_bf16_nt(x_fp8, y_fp8, out)

            t = bench_kineto(test_func, 'fp8_gemm', suppress_kineto_output=True)
            tflops = 2 * m * n * k / t / 1e12
            bandwidth = (m * k + k * n + m * n * 2) / t / 1e9
            time_us = t * 1e6
            print(f' > Performance (m={m:5}, n={n:5}, k={k:5}): {time_us:.2f} us | throughput: {tflops:.2f} TFLOPS, {bandwidth:.2f} GB/s')

            results.append([m, n, k, round(time_us, 2), round(tflops, 2), round(bandwidth, 2)])

    # Save results once after all benchmarks
    with open('results.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['M', 'N', 'K', 'Latency_us', 'TFLOPS', 'Bandwidth_GBps'])  # Header
        writer.writerows(results)

    print("Benchmarks saved to results.csv")
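A sketch of how the new CSV output could be produced and inspected; the test-file path is an assumption, while results.csv and its M, N, K, Latency_us, TFLOPS and Bandwidth_GBps columns come from the diff above:

# Run the modified benchmark (path assumed); it writes results.csv to the working directory
python3 tests/test_core.py

# View the per-shape results as an aligned table
column -s, -t results.csv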