diff --git a/.gitignore b/.gitignore
index 3e6e4e5..eabc477 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,6 @@ dist
 # Third-party links created by `setup.py develop`
 deep_gemm/include/cute
 deep_gemm/include/cutlass
+
+
+/documentation
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..0c8ceb1
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,24 @@
+FROM nvidia/cuda:12.3.1-devel-ubuntu22.04
+
+# Install required packages
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    python3 python3-pip git build-essential && \
+    rm -rf /var/lib/apt/lists/*
+
+# Set working directory
+WORKDIR /workspace
+
+# Copy DeepGEMM into container
+COPY . /workspace/DeepGEMM
+
+# Install Python dependencies (if any, e.g., torch)
+RUN python3 -m pip install --upgrade pip setuptools
+RUN python3 -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
+RUN python3 -m pip install -e /workspace/DeepGEMM
+
+# Environment variables (optional)
+ENV NVIDIA_VISIBLE_DEVICES all
+ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
+
+# Default command
+CMD ["/bin/bash"]
diff --git a/scripts/docker-build.sh b/scripts/docker-build.sh
new file mode 100755
index 0000000..5de91ab
--- /dev/null
+++ b/scripts/docker-build.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+docker build -t deepgemm-test:latest -f Dockerfile .
diff --git a/tests/test_core.py b/tests/test_core.py
index 68d9b79..e961a7b 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -1,6 +1,9 @@
-import random
 import torch
 from typing import Tuple
+import csv
+import random
+
+
 
 import deep_gemm
 from deep_gemm import bench_kineto, calc_diff, ceil_div, get_col_major_tma_aligned_tensor
@@ -143,16 +146,42 @@ def test_m_grouped_gemm_masked() -> None:
               f'{(num_groups * (m * k + k * n + m * n * 2)) / 1e9 / t:4.0f} GB/s')
     print()
 
-
-if __name__ == '__main__':
-    torch.backends.cuda.matmul.allow_tf32 = True
+if __name__ == "__main__":
     torch.backends.cudnn.allow_tf32 = True
+    torch.backends.cuda.matmul.allow_tf32 = True
     torch.manual_seed(0)
     random.seed(0)
 
     print('Library path:')
     print(f' > {deep_gemm.__path__}\n')
 
-    test_gemm()
-    test_m_grouped_gemm_contiguous()
-    test_m_grouped_gemm_masked()
+    results = []
+
+    # Collect GEMM benchmarks
+    print('Testing GEMM:')
+    for m in (64, 128, 4096):
+        for k, n in [(7168, 2112), (1536, 24576), (512, 32768), (16384, 7168), (7168, 4096), (2048, 7168)]:
+            x_fp8, y_fp8, out, ref_out = construct(m, k, n)
+            deep_gemm.gemm_fp8_fp8_bf16_nt(x_fp8, y_fp8, out)
+            diff = calc_diff(out, ref_out)
+            assert diff < 0.001, f'{m=}, {k=}, {n=}, {diff:.5f}'
+
+            def test_func():
+                x_fp8, y_fp8, out, ref_out = construct(m, k, n)
+                deep_gemm.gemm_fp8_fp8_bf16_nt(x_fp8, y_fp8, out)
+
+            t = bench_kineto(test_func, 'fp8_gemm', suppress_kineto_output=True)
+            tflops = 2 * m * n * k / t / 1e12
+            bandwidth = (m * k + k * n + m * n * 2) / t / 1e9
+            time_us = t * 1e6
+            print(f' > Performance (m={m:5}, n={n:5}, k={k:5}): {time_us:.2f} us | throughput: {tflops:.2f} TFLOPS, {bandwidth:.2f} GB/s')
+
+            results.append([m, n, k, round(time_us, 2), round(tflops, 2), round(bandwidth, 2)])
+
+    # Save results once after all benchmarks
+    with open('results.csv', 'w', newline='') as csvfile:
+        writer = csv.writer(csvfile)
+        writer.writerow(['M', 'N', 'K', 'Latency_us', 'TFLOPS', 'Bandwidth_GBps'])  # Header
+        writer.writerows(results)
+
+    print("Benchmarks saved to results.csv")
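
Usage sketch: a minimal example of how the Docker workflow introduced above might be exercised, assuming the host has an NVIDIA driver plus the NVIDIA Container Toolkit (needed for `--gpus all`) and a GPU supported by DeepGEMM. The image tag comes from scripts/docker-build.sh in this patch; the container name `deepgemm-bench` and the `docker cp` step are illustrative, not part of the change.

    # Build the image from the repository root
    ./scripts/docker-build.sh

    # Run the benchmark; results.csv is written to the container's working
    # directory (/workspace, per the Dockerfile's WORKDIR)
    docker run --gpus all --name deepgemm-bench deepgemm-test:latest \
        python3 /workspace/DeepGEMM/tests/test_core.py

    # Copy the CSV back to the host and remove the stopped container
    docker cp deepgemm-bench:/workspace/results.csv .
    docker rm deepgemm-bench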