mirror of
https://github.com/deepseek-ai/DeepGEMM
synced 2025-06-03 16:56:11 +00:00
Add CMake support for CLion indexing
This commit is contained in:
parent
327ec92f69
commit
5bda27244b
44
CMakeLists.txt
Normal file
44
CMakeLists.txt
Normal file
@ -0,0 +1,44 @@
|
||||
# NOTE: this file currently exists only so that CMake-based IDEs (e.g. CLion) can index the
# sources; the real compilation is done via JIT.
# TODO: add CUDA utils' library via CMake

# 3.17 is the first CMake release that ships the FindCUDAToolkit module, which this file
# requires through `find_package(CUDAToolkit REQUIRED)`.
cmake_minimum_required(VERSION 3.17)
project(deep_gemm LANGUAGES CXX CUDA)

# Project-wide defaults for the C++ and CUDA language standards.
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CUDA_STANDARD 20)
# Ask Makefile generators to echo the full command lines they run.
set(CMAKE_VERBOSE_MAKEFILE ON)
|
||||
|
||||
# Locate the CUDA toolkit and pybind11; configuration stops if either one is missing.
find_package(CUDAToolkit REQUIRED)
find_package(pybind11 REQUIRED)

# Probe whether the installed nvcc can target sm_90a by compiling an empty kernel.
#
# `CUDA_NVCC_EXECUTABLE` is only defined by the legacy FindCUDA module (or by the user on the
# command line), and nothing above has loaded that module. Honour it when it is set, otherwise
# use the nvcc that FindCUDAToolkit just located, so the probe does not run an empty command.
if (CUDA_NVCC_EXECUTABLE)
    set(DEEP_GEMM_PROBE_NVCC "${CUDA_NVCC_EXECUTABLE}")
else()
    set(DEEP_GEMM_PROBE_NVCC "${CUDAToolkit_NVCC_EXECUTABLE}")
endif()

# CMAKE_CUDA_FLAGS is a single space-separated string; split it so that every flag reaches
# nvcc as its own argument instead of one argument containing spaces.
separate_arguments(DEEP_GEMM_PROBE_FLAGS NATIVE_COMMAND "${CMAKE_CUDA_FLAGS}")

file(WRITE "${CMAKE_BINARY_DIR}/test_cuda.cu" "extern \"C\" __global__ void testKernel() { }")
execute_process(
    COMMAND "${DEEP_GEMM_PROBE_NVCC}" ${DEEP_GEMM_PROBE_FLAGS}
            -gencode arch=compute_90a,code=sm_90a
            -o "${CMAKE_BINARY_DIR}/test_cuda.o"
            -c "${CMAKE_BINARY_DIR}/test_cuda.cu"
    RESULT_VARIABLE NVCC_RESULT
    OUTPUT_VARIABLE NVCC_OUTPUT
    ERROR_VARIABLE NVCC_ERROR_OUTPUT
    WORKING_DIRECTORY "${CMAKE_BINARY_DIR}"
)

# A zero exit code means the probe compiled; any other result (including a failure to launch
# the compiler at all) leaves NVCC_SUPPORTS_SM90 unset.
if (NVCC_RESULT EQUAL "0")
    set(NVCC_SUPPORTS_SM90 TRUE)
    message(STATUS "NVCC supports SM90")
else()
    message(STATUS "NVCC does not support SM90")
endif()
|
||||
|
||||
# Extra architecture setup, applied only when the nvcc probe succeeded.
if (NVCC_SUPPORTS_SM90)
    # Ask nvcc to generate code for sm_90a in addition to whatever else is configured.
    list(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_90a,code=sm_90a)
    # NOTE(review): the forced cache value is "8.6" although the help string talks about 90a;
    # presumably the variable is consumed by the Torch package below - confirm this is intended.
    set(TORCH_CUDA_ARCH_LIST "8.6" CACHE STRING "Add arch tag 90a to NVCC" FORCE)
endif()

# Looked up after the variables above have been set.
find_package(Torch REQUIRED)
|
||||
|
||||
# Header search paths for the project itself and the bundled CUTLASS checkout.
# NOTE(review): these stay directory-scoped because the target below is created with
# `cuda_add_library`; presumably it picks up directory-level include paths - confirm before
# converting to target_include_directories.
include_directories(deep_gemm/include third-party/cutlass/include third-party/cutlass/tools/util/include)

# CUDA, Torch and Python headers. `CUDA_TOOLKIT_ROOT_DIR` is a legacy FindCUDA variable that
# this file never sets itself, so the directories reported by FindCUDAToolkit are listed as well.
include_directories(
    ${CUDA_TOOLKIT_ROOT_DIR}/include
    ${CUDAToolkit_INCLUDE_DIRS}
    ${TORCH_INCLUDE_DIRS}
    ${PYTHON_INCLUDE_DIRS}
)

# Library search paths. `CUDAToolkit_LIBRARY_DIR` is the directory that actually contains the
# CUDA runtime, which is not necessarily `<root>/lib`.
link_directories(
    ${TORCH_INSTALL_PREFIX}/lib
    ${CUDA_TOOLKIT_ROOT_DIR}/lib
    ${CUDAToolkit_LIBRARY_DIR}
)
|
||||
|
||||
# Optimisation and position-independent-code flags for the host compilers.
string(APPEND CMAKE_C_FLAGS " -O3 -fPIC")
string(APPEND CMAKE_CXX_FLAGS " -O3 -fPIC")

# Flags for CUDA sources: CMAKE_CUDA_FLAGS is CMake's per-language flag string, while
# CUDA_NVCC_FLAGS is the variable extended earlier in this file with the sm_90a target.
string(APPEND CMAKE_CUDA_FLAGS " -O3 -fPIC -DNDEBUG")
string(APPEND CUDA_NVCC_FLAGS " -O3 -std=c++17 -DNDEBUG --ptxas-options=--register-usage-level=10")

# Static library built from the single indexing translation unit.
cuda_add_library(example_gemm STATIC indexing/main.cu)
|
@ -1,6 +1,6 @@
|
||||
#pragma once
|
||||
#pragma clang diagnostic push
|
||||
#pragma clang diagnostic ignored "-Wunknown-attributes"
|
||||
#pragma once
|
||||
|
||||
#include <cutlass/arch/barrier.h>
|
||||
#include <cutlass/arch/reg_reconfig.h>
|
||||
|
29
indexing/main.cu
Normal file
29
indexing/main.cu
Normal file
@ -0,0 +1,29 @@
|
||||
#include "deep_gemm/fp8_gemm.cuh"
|
||||
|
||||
using namespace deep_gemm;
|
||||
|
||||
int main() {
|
||||
int m = 128;
|
||||
constexpr int N = 4096;
|
||||
constexpr int K = 7168;
|
||||
|
||||
constexpr int BLOCK_M = 128;
|
||||
constexpr int BLOCK_N = 128;
|
||||
constexpr int BLOCK_K = 128;
|
||||
constexpr int BLOCK_N_PADDING = 0;
|
||||
constexpr int kNumGroups = 1;
|
||||
constexpr int kNumStages = 5;
|
||||
constexpr int kNumTMAMulticast = 1;
|
||||
constexpr bool kIsTMAMulticastOnA = false;
|
||||
|
||||
using gemm_t = Gemm<N, K, BLOCK_M, BLOCK_N, BLOCK_K, BLOCK_N_PADDING, kNumGroups, kNumStages, kNumTMAMulticast, kIsTMAMulticastOnA, GemmType::Normal>;
|
||||
auto tma_a_desc = gemm_t::make_2d_tma_a_desc(reinterpret_cast<__nv_fp8_e4m3*>(0), m);
|
||||
auto tma_b_desc = gemm_t::make_2d_tma_b_desc(reinterpret_cast<__nv_fp8_e4m3*>(0));
|
||||
auto tma_scales_a_desc = gemm_t::make_2d_tma_scales_a_desc(reinterpret_cast<float*>(0), m);
|
||||
auto tma_d_desc = gemm_t::make_3d_tma_d_desc(reinterpret_cast<nv_bfloat16*>(0), m);
|
||||
gemm_t::run(nullptr, nullptr, nullptr,
|
||||
m,
|
||||
tma_a_desc, tma_b_desc, tma_scales_a_desc, tma_d_desc,
|
||||
nullptr, 132, 0);
|
||||
return 0;
|
||||
}
|
Loading…
Reference in New Issue
Block a user