mirror of
https://github.com/deepseek-ai/FlashMLA
synced 2025-05-14 08:41:09 +00:00
* Fix benchmark script * Performance optimization for compute-bound cases * Add new testcase (s_k = 16384) * Update README.md * Update comment * Update README.md * Add the deep-dive blog * Add background color for MLA Kernel Sched.drawio.svg * Use relative path for the schedule image * Move flash_mla.h to kernels/params.h
33 lines
2.2 KiB
C
33 lines
2.2 KiB
C
#pragma once
|
|
|
|
// Evaluates a CUDA runtime expression exactly once and checks its result.
// If the result is not cudaSuccess, prints the file, line and the
// cudaGetErrorString() text to stderr and terminates the process via exit(1).
// The argument is parenthesised so any expression can be passed safely, and the
// do { ... } while(0) wrapper makes the macro behave as a single statement.
// Relies on fprintf/exit being declared by the including translation unit.
#define CHECK_CUDA(call) \
    do { \
        cudaError_t status_ = (call); \
        if (status_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error (%s:%d): %s\n", __FILE__, __LINE__, cudaGetErrorString(status_)); \
            exit(1); \
        } \
    } while(0)
|
|
|
|
// Passes the result of cudaGetLastError() to CHECK_CUDA, so a pending CUDA
// error (e.g. from the kernel launch just issued) is reported and the process
// exits. Takes no arguments.
#define CHECK_CUDA_KERNEL_LAUNCH() \
    CHECK_CUDA(cudaGetLastError())
|
|
|
|
|
|
// Host-side assertion that is always active (not tied to NDEBUG).
// Evaluates `cond` exactly once; if it is false, prints the file, line and the
// stringified condition to stderr and terminates the process via exit(1).
// The do { ... } while(0) wrapper makes the macro behave as a single statement,
// so it is safe inside un-braced if/else.
// Relies on fprintf/exit being declared by the including translation unit.
#define FLASH_ASSERT(cond) \
    do { \
        if (not (cond)) { \
            fprintf(stderr, "Assertion failed (%s:%d): %s\n", __FILE__, __LINE__, #cond); \
            exit(1); \
        } \
    } while(0)
|
|
|
|
|
|
// Assertion for device code.
// Evaluates `cond` exactly once; if it is false, prints the file, line and the
// stringified condition with printf and then executes the PTX `trap`
// instruction to abort execution.
// The asm statement is marked `volatile` so the compiler may not delete or
// move the trap: a plain asm() is assumed to have no side effects.
// The do { ... } while(0) wrapper makes the macro behave as a single statement.
#define FLASH_DEVICE_ASSERT(cond) \
    do { \
        if (not (cond)) { \
            printf("Assertion failed (%s:%d): %s\n", __FILE__, __LINE__, #cond); \
            asm volatile("trap;"); \
        } \
    } while(0)
|
|
|
|
// Calls print(fmt, ...) and then print("\n") to end the line.
// `print` is not defined in this header and must be visible at the point of
// use (presumably CuTe's print — confirm against the including files).
// `##__VA_ARGS__` drops the trailing comma when no extra arguments are given.
// Wrapped in do { ... } while(0) rather than a bare block, so that
// `if (x) println(...); else ...` parses as intended.
#define println(fmt, ...) \
    do { \
        print(fmt, ##__VA_ARGS__); \
        print("\n"); \
    } while(0)
|