diff --git a/.gitignore b/.gitignore
index 3e6e4e5..eabc477 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,6 @@ dist
 # Third-party links created by `setup.py develop`
 deep_gemm/include/cute
 deep_gemm/include/cutlass
+
+
+/documentation
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..0c8ceb1
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,24 @@
+FROM nvidia/cuda:12.3.1-devel-ubuntu22.04
+
+# Install required packages
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    python3 python3-pip git build-essential && \
+    rm -rf /var/lib/apt/lists/*
+
+# Set working directory
+WORKDIR /workspace
+
+# Copy DeepGEMM into container
+COPY . /workspace/DeepGEMM
+
+# Install Python dependencies (if any, e.g., torch)
+RUN python3 -m pip install --upgrade pip setuptools
+RUN python3 -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
+RUN python3 -m pip install -e /workspace/DeepGEMM
+
+# Environment variables (optional)
+ENV NVIDIA_VISIBLE_DEVICES all
+ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
+
+# Default command
+CMD ["/bin/bash"]
diff --git a/scripts/docker-build.sh b/scripts/docker-build.sh
new file mode 100755
index 0000000..5de91ab
--- /dev/null
+++ b/scripts/docker-build.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+docker build -t deepgemm-test:latest -f Dockerfile .
diff --git a/tests/test_core.py b/tests/test_core.py
index 68d9b79..e961a7b 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -1,6 +1,9 @@
-import random
 import torch
 from typing import Tuple
+import csv
+import random
+
+
 
 import deep_gemm
 from deep_gemm import bench_kineto, calc_diff, ceil_div, get_col_major_tma_aligned_tensor
@@ -143,16 +146,42 @@ def test_m_grouped_gemm_masked() -> None:
               f'{(num_groups * (m * k + k * n + m * n * 2)) / 1e9 / t:4.0f} GB/s')
     print()
 
-
-if __name__ == '__main__':
-    torch.backends.cuda.matmul.allow_tf32 = True
+if __name__ == "__main__":
     torch.backends.cudnn.allow_tf32 = True
+    torch.backends.cuda.matmul.allow_tf32 = True
     torch.manual_seed(0)
     random.seed(0)
 
     print('Library path:')
     print(f' > {deep_gemm.__path__}\n')
 
-    test_gemm()
-    test_m_grouped_gemm_contiguous()
-    test_m_grouped_gemm_masked()
+    results = []
+
+    # Collect GEMM benchmarks
+    print('Testing GEMM:')
+    for m in (64, 128, 4096):
+        for k, n in [(7168, 2112), (1536, 24576), (512, 32768), (16384, 7168), (7168, 4096), (2048, 7168)]:
+            x_fp8, y_fp8, out, ref_out = construct(m, k, n)
+            deep_gemm.gemm_fp8_fp8_bf16_nt(x_fp8, y_fp8, out)
+            diff = calc_diff(out, ref_out)
+            assert diff < 0.001, f'{m=}, {k=}, {n=}, {diff:.5f}'
+
+            def test_func():
+                x_fp8, y_fp8, out, ref_out = construct(m, k, n)
+                deep_gemm.gemm_fp8_fp8_bf16_nt(x_fp8, y_fp8, out)
+
+            t = bench_kineto(test_func, 'fp8_gemm', suppress_kineto_output=True)
+            tflops = 2 * m * n * k / t / 1e12
+            bandwidth = (m * k + k * n + m * n * 2) / t / 1e9
+            time_us = t * 1e6
+            print(f' > Performance (m={m:5}, n={n:5}, k={k:5}): {time_us:.2f} us | throughput: {tflops:.2f} TFLOPS, {bandwidth:.2f} GB/s')
+
+            results.append([m, n, k, round(time_us, 2), round(tflops, 2), round(bandwidth, 2)])
+
+    # Save results once after all benchmarks
+    with open('results.csv', 'w', newline='') as csvfile:
+        writer = csv.writer(csvfile)
+        writer.writerow(['M', 'N', 'K', 'Latency_us', 'TFLOPS', 'Bandwidth_GBps'])  # Header
+        writer.writerows(results)
+
+    print("Benchmarks saved to results.csv")
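
Usage sketch: a minimal example of how the Docker workflow introduced above might be exercised, assuming the host has an NVIDIA driver plus the NVIDIA Container Toolkit (needed for `--gpus all`) and a GPU supported by DeepGEMM. The image tag comes from scripts/docker-build.sh in this patch; the container name `deepgemm-bench` and the `docker cp` step are illustrative, not part of the change.

    # Build the image from the repository root
    ./scripts/docker-build.sh

    # Run the benchmark; results.csv is written to the container's working
    # directory (/workspace, per the Dockerfile's WORKDIR)
    docker run --gpus all --name deepgemm-bench deepgemm-test:latest \
        python3 /workspace/DeepGEMM/tests/test_core.py

    # Copy the CSV back to the host and remove the stopped container
    docker cp deepgemm-bench:/workspace/results.csv .
    docker rm deepgemm-bench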