gpu_burn is a CUDA-based stress-testing tool designed for multi-GPU systems. It helps system administrators, researchers, and hardware enthusiasts probe the limits of their GPUs.
gpu_burn is an open-source project with a small, clearly laid-out codebase that is quick to build and easy to customize.
The overall code structure is simple; the core is roughly 1,000 lines.
Core header files:
#include "cublas_v2.h" // cuBLAS library
#include <cuda.h> // CUDA driver API
gpu_burn calls functions from the CUDA driver API only; it never uses the CUDA runtime API. The compare kernel in the .cu file is compiled with nvcc into a PTX file.
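For reference, the PTX file can be produced with a command along these lines (the project's Makefile drives this step; exact flags may vary):
nvcc -ptx compare.cu -o compare.ptx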
int runLength = 10; // burn duration, in seconds
bool useDoubles = false; // use double precision
bool useTensorCores = false; // use Tensor Cores
int thisParam = 0; // number of option arguments consumed during parsing
ssize_t useBytes = 0; // 0 == use USEMEM% of free mem; bytes of GPU memory to use
int device_id = -1; // GPU index to burn; -1 (the default) means all GPUs
char *kernelFile = (char *)COMPARE_KERNEL; // PTX file to load
#define COMPARE_KERNEL "compare.ptx" // default PTX file name; can be overridden with -c
Running with -h prints the usage:
GPU Burn
Usage: gpu_burn [OPTIONS] [TIME]
-m X Use X MB of memory
-m N% Use N% of the available GPU memory
-d Use doubles
-tc Try to use Tensor cores (if available)
-l List all GPUs in the system
-i N Execute only on GPU N
-c FILE Use FILE as compare kernel. Default is compare.ptx
-stts T Set timeout threshold to T seconds for using SIGTERM to abort child processes before using SIGKILL. Default is 30
-h Show this help message
Example:
gpu-burn -d 3600 # burns all GPUs with doubles for an hour
gpu-burn -m 50% # burns using 50% of the available GPU memory
gpu-burn -l # list GPUs
gpu-burn -i 2 # burns only GPU of index 2
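The options compose as expected, e.g.:
gpu-burn -d -tc -m 90% 3600 # doubles with Tensor Cores on 90% of memory, for an hour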
The -l listing path enumerates devices through the driver API:
int count = initCuda(); // initialize CUDA (required before any driver-API call); returns the device count
if (count == 0) {
throw std::runtime_error("No CUDA capable GPUs found.\n");
}
for (int i_dev = 0; i_dev < count; i_dev++) {
CUdevice device_l;
CUdevprop device_p;
char device_name[255]; // GPU device name
checkError(cuDeviceGet(&device_l, i_dev));
checkError(cuDeviceGetName(device_name, 255, device_l));
checkError(cuDeviceGetProperties(&device_p, device_l));
size_t device_mem_l;
checkError(cuDeviceTotalMem(&device_mem_l, device_l));
printf("ID %i: %s, %ldMB\n", i_dev, device_name, device_mem_l / 1000 / 1000);
}
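checkError() is a small helper used throughout but not shown above. A minimal sketch of what it presumably does (translate a failing CUresult and abort; the real source also overloads it for cuBLAS status codes):
#include <cuda.h>
#include <cstdio>
#include <cstdlib>
#include <string>

// Sketch only, not the verbatim gpu_burn helper: print a readable
// message for a failing driver-API call, then abort.
static void checkError(CUresult status, const std::string &label = "") {
    if (status != CUDA_SUCCESS) {
        const char *msg = "unknown error";
        cuGetErrorString(status, &msg); // driver-API error string
        fprintf(stderr, "CUDA error %s: %s\n", label.c_str(), msg);
        exit(EXIT_FAILURE);
    }
}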
typedef struct CUdevprop_st {
int maxThreadsPerBlock;
int maxThreadsDim[3];
int maxGridSize[3];
int sharedMemPerBlock; // shared memory per block, in bytes
int totalConstantMemory; // constant memory, in bytes
int SIMDWidth; // SIMD (single instruction, multiple data) width
int memPitch;
int regsPerBlock;
int clockRate;
int textureAlign;
} CUdevprop;
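Note that cuDeviceGetProperties() is deprecated in recent CUDA releases; individual fields can be queried through cuDeviceGetAttribute() instead. A sketch, reusing the device_l handle from the loop above:
int maxThreadsPerBlock = 0;
checkError(cuDeviceGetAttribute(&maxThreadsPerBlock,
                                CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                                device_l));
printf("max threads per block: %d\n", maxThreadsPerBlock);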
After argument parsing, launch() is called:
if (useDoubles)
launch<double>(runLength, useDoubles, useTensorCores, useBytes,
device_id, kernelFile, sigterm_timeout_threshold_secs);
else
launch<float>(runLength, useDoubles, useTensorCores, useBytes,
device_id, kernelFile, sigterm_timeout_threshold_secs);
template <class T>
void launch(int runLength, bool useDoubles, bool useTensorCores,
ssize_t useBytes, int device_id, const char * kernelFile,
std::chrono::seconds sigterm_timeout_threshold_secs)
// Initializing A and B with random data
T *A = (T *)malloc(sizeof(T) * SIZE * SIZE);
T *B = (T *)malloc(sizeof(T) * SIZE * SIZE);
srand(10);
for (size_t i = 0; i < SIZE * SIZE; ++i) {
A[i] = (T)((double)(rand() % 1000000) / 100000.0);
B[i] = (T)((double)(rand() % 1000000) / 100000.0);
}
int mainPipe[2];
pipe(mainPipe);
To keep things simple, only the single-GPU flow is described here; the multi-GPU case is analogous and left for interested readers to explore.
pid_t myPid = fork();
if (!myPid) {
// Child
close(mainPipe[0]);
int writeFd = mainPipe[1];
initCuda();
int devCount = 1;
write(writeFd, &devCount, sizeof(int));
startBurn<T>(device_id, writeFd, A, B, useDoubles, useTensorCores,
useBytes, kernelFile);
close(writeFd);
return;
} else {
clientPids.push_back(myPid);
close(mainPipe[1]);
int readMain = mainPipe[0]; // parent keeps the read end
clientPipes.push_back(readMain);
int devCount;
read(readMain, &devCount, sizeof(int));
listenClients(clientPipes, clientPids, runLength, sigterm_timeout_threshold_secs);
}
for (size_t i = 0; i < clientPipes.size(); ++i)
close(clientPipes.at(i));
template <class T>
void startBurn(int index, int writeFd, T *A, T *B, bool doubles, bool tensors,
ssize_t useBytes, const char *kernelFile)
int eventIndex = 0;
const int maxEvents = 2;
CUevent events[maxEvents];
for (int i = 0; i < maxEvents; ++i)
cuEventCreate(events + i, 0);
int nonWorkIters = maxEvents;
while (cuEventQuery(events[eventIndex]) != CUDA_SUCCESS)
usleep(1000); // GPU work not finished yet: sleep the CPU for 1000 µs
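The idea is double buffering: after queuing a round of asynchronous work, an event is recorded on the stream, and cuEventQuery() returns CUDA_ERROR_NOT_READY until everything before it has finished. A minimal sketch of one round (doWork() is a hypothetical stand-in for the GEMM + compare calls):
CUevent ev;
checkError(cuEventCreate(&ev, 0), "create event");
doWork();                                   // hypothetical: queue async kernels
checkError(cuEventRecord(ev, 0), "record"); // mark the end of this round
while (cuEventQuery(ev) == CUDA_ERROR_NOT_READY)
    usleep(1000);                           // still running: back off 1 ms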
template <class T> class GPU_Test; // per-GPU worker class; its key data members:
bool d_doubles; // use double precision
bool d_tensors; // use Tensor Cores
int d_devNumber; // index of the GPU this instance drives
const char *d_kernelFile; // compare-kernel file name (the PTX file)
size_t d_iters; // number of GEMM iterations per round (output matrices C that fit)
size_t d_resultSize; // size of one matrix C, in bytes
long long int d_error; // accumulated error count
static const int g_blockSize = 16; // CUDA block edge length
CUdevice d_dev;
CUcontext d_ctx;
CUmodule d_module;
CUfunction d_function;
CUdeviceptr d_Cdata;
CUdeviceptr d_Adata;
CUdeviceptr d_Bdata;
CUdeviceptr d_faultyElemData;
int *d_faultyElemsHost;
cublasHandle_t d_cublas;
checkError(cuDeviceGet(&d_dev, d_devNumber));
checkError(cuCtxCreate(&d_ctx, 0, d_dev));
bind();
// checkError(cublasInit());
checkError(cublasCreate(&d_cublas), "init");
if (d_tensors)
checkError(cublasSetMathMode(d_cublas, CUBLAS_TENSOR_OP_MATH));
checkError(cuMemAllocHost((void **)&d_faultyElemsHost, sizeof(int)));
d_error = 0;
g_running = true;
struct sigaction action;
memset(&action, 0, sizeof(struct sigaction));
action.sa_handler = termHandler;
sigaction(SIGTERM, &action, NULL);
bind();
checkError(cuMemFree(d_Cdata), "Free C");
checkError(cuMemFree(d_Adata), "Free A");
checkError(cuMemFree(d_Bdata), "Free B");
cuMemFreeHost(d_faultyElemsHost);
printf("Freed memory for dev %d\n", d_devNumber);
cublasDestroy(d_cublas);
printf("Uninitted cublas\n");
static void termHandler(int signum) { g_running = false; }
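termHandler() just clears the global g_running flag, which shouldRun() polls every round, so a SIGTERM from the parent lets the child finish its current iteration and free resources instead of dying mid-kernel. (In the source the flag is a plain global bool; volatile sig_atomic_t would be the strictly signal-safe type.) getErrors() then drains the per-round fault counter: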
if (*d_faultyElemsHost) {
d_error += (long long int)*d_faultyElemsHost;
}
unsigned long long int tempErrs = d_error;
d_error = 0; // reset the error counter after each read-out
return tempErrs;
size_t getIters() { return d_iters; }
void bind() { checkError(cuCtxSetCurrent(d_ctx), "Bind CTX"); }
// availMemory(): query how much device memory is currently free
bind();
size_t freeMem, totalMem;
checkError(cuMemGetInfo(&freeMem, &totalMem));
// initBuffers(): translate useBytes into an allocation plan
bind();
if (useBytes == 0)
useBytes = (ssize_t)((double)availMemory() * USEMEM); // default: a fixed fraction of free memory
if (useBytes < 0)
useBytes = (ssize_t)((double)availMemory() * (-useBytes / 100.0)); // negative value encodes -m N%
size_t d_resultSize = sizeof(T) * SIZE * SIZE;
d_iters = (useBytes - 2 * d_resultSize) /
d_resultSize; // We remove A and B sizes
if ((size_t)useBytes < 3 * d_resultSize)
throw std::string("Low mem for result. aborting.\n");
checkError(cuMemAlloc(&d_Cdata, d_iters * d_resultSize), "C alloc");
checkError(cuMemAlloc(&d_Adata, d_resultSize), "A alloc");
checkError(cuMemAlloc(&d_Bdata, d_resultSize), "B alloc");
checkError(cuMemAlloc(&d_faultyElemData, sizeof(int)), "faulty data");
// Populating matrices A and B
checkError(cuMemcpyHtoD(d_Adata, A, d_resultSize), "A -> device");
checkError(cuMemcpyHtoD(d_Bdata, B, d_resultSize), "B -> device");
initCompareKernel();
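To make the sizing concrete: with SIZE = 8192 and single precision, d_resultSize = 4 × 8192² bytes = 256 MiB. On a card with 16 GiB free and the default 90% USEMEM fraction, useBytes ≈ 14.4 GiB, so d_iters = (14.4 GiB − 2 × 256 MiB) / 256 MiB ≈ 55: the A and B inputs are budgeted once and the rest of the memory is filled with 55 output matrices.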
// compute(): queue d_iters GEMMs, each writing its own slice of d_Cdata
bind();
static const float alpha = 1.0f;
static const float beta = 0.0f;
static const double alphaD = 1.0;
static const double betaD = 0.0;
// C = α·A·B + β·C
for (size_t i = 0; i < d_iters; ++i) {
if (d_doubles)
checkError(
cublasDgemm(d_cublas, CUBLAS_OP_N, CUBLAS_OP_N, SIZE, SIZE,
SIZE, &alphaD, (const double *)d_Adata, SIZE,
(const double *)d_Bdata, SIZE, &betaD,
(double *)d_Cdata + i * SIZE * SIZE, SIZE),
"DGEMM");
else
checkError(
cublasSgemm(d_cublas, CUBLAS_OP_N, CUBLAS_OP_N, SIZE, SIZE,
SIZE, &alpha, (const float *)d_Adata, SIZE,
(const float *)d_Bdata, SIZE, &beta,
(float *)d_Cdata + i * SIZE * SIZE, SIZE),
"SGEMM");
}
// initCompareKernel(): load the PTX module and prepare the launch
{
std::ifstream f(d_kernelFile);
checkError(f.good() ? CUDA_SUCCESS : CUDA_ERROR_NOT_FOUND,
std::string("couldn't find compare kernel: ") + d_kernelFile);
}
checkError(cuModuleLoad(&d_module, d_kernelFile), "load module");
checkError(cuModuleGetFunction(&d_function, d_module,
d_doubles ? "compareD" : "compare"),
"get func");
checkError(cuFuncSetCacheConfig(d_function, CU_FUNC_CACHE_PREFER_L1),
"L1 config");
checkError(cuParamSetSize(d_function, __alignof(T *) +
__alignof(int *) +
__alignof(size_t)),
"set param size");
checkError(cuParamSetv(d_function, 0, &d_Cdata, sizeof(T *)),
"set param");
checkError(cuParamSetv(d_function, __alignof(T *), &d_faultyElemData,
sizeof(T *)),
"set param");
checkError(cuParamSetv(d_function, __alignof(T *) + __alignof(int *),
&d_iters, sizeof(size_t)),
"set param");
checkError(cuFuncSetBlockShape(d_function, g_blockSize, g_blockSize, 1),
"set block size");
checkError(cuMemsetD32Async(d_faultyElemData, 0, 1, 0), "memset");
checkError(cuLaunchGridAsync(d_function, SIZE / g_blockSize,
SIZE / g_blockSize, 0),
"Launch grid");
checkError(cuMemcpyDtoHAsync(d_faultyElemsHost, d_faultyElemData,
sizeof(int), 0),
"Read faultyelemdata");
bool shouldRun() { return g_running; }
void listenClients(std::vector<int> clientFd, std::vector<pid_t> clientPid,
int runTime, std::chrono::seconds sigterm_timeout_threshold_secs)
fd_set waitHandles;
pid_t tempPid;
int tempHandle = pollTemp(&tempPid); // read end of the temperature pipe (see pollTemp() below)
int maxHandle = tempHandle;
FD_ZERO(&waitHandles);
FD_SET(tempHandle, &waitHandles);
for (size_t i = 0; i < clientFd.size(); ++i) {
if (clientFd.at(i) > maxHandle)
maxHandle = clientFd.at(i);
FD_SET(clientFd.at(i), &waitHandles);
}
For reference, the fd_set helper macros used here:
FD_SET(int fd, fd_set *set); // add fd to the set
FD_CLR(int fd, fd_set *set); // remove fd from the set
FD_ISSET(int fd, fd_set *set); // test whether fd is in the set (0 if absent)
FD_ZERO(fd_set *set); // clear the set
int changeCount;
while (
(changeCount = select(maxHandle + 1, &waitHandles, NULL, NULL, NULL))) {
size_t thisTime = time(0);
struct timespec thisTimeSpec;
clock_gettime(CLOCK_REALTIME, &thisTimeSpec);
// Going through all descriptors
for (size_t i = 0; i < clientFd.size(); ++i)
if (FD_ISSET(clientFd.at(i), &waitHandles)) {
// First, reading processed
int processed, errors;
int res = read(clientFd.at(i), &processed, sizeof(int));
if (res < (int)sizeof(int)) {
fprintf(stderr, "read[%zu] error %d\n", i, res);
processed = -1;
}
// Then errors
read(clientFd.at(i), &errors, sizeof(int));
clientErrors.at(i) += errors;
if (processed == -1)
clientCalcs.at(i) = -1;
else {
double flops = (double)processed * (double)OPS_PER_MUL;
struct timespec clientPrevTime = clientUpdateTime.at(i);
double clientTimeDelta =
(double)thisTimeSpec.tv_sec +
(double)thisTimeSpec.tv_nsec / 1000000000.0 -
((double)clientPrevTime.tv_sec +
(double)clientPrevTime.tv_nsec / 1000000000.0);
clientUpdateTime.at(i) = thisTimeSpec;
clientGflops.at(i) =
(double)((unsigned long long int)processed *
OPS_PER_MUL) /
clientTimeDelta / 1000.0 / 1000.0 / 1000.0;
clientCalcs.at(i) += processed;
}
childReport = true;
}
if (FD_ISSET(tempHandle, &waitHandles))
updateTemps(tempHandle, &clientTemp);
// Resetting the listeners
FD_ZERO(&waitHandles);
FD_SET(tempHandle, &waitHandles);
for (size_t i = 0; i < clientFd.size(); ++i)
FD_SET(clientFd.at(i), &waitHandles);
// Printing progress (if a child has initted already)
if (childReport) {
float elapsed =
fminf((float)(thisTime - startTime) / (float)runTime * 100.0f,
100.0f);
printf("\r%.1f%% ", elapsed);
printf("proc'd: ");
for (size_t i = 0; i < clientCalcs.size(); ++i) {
printf("%d (%.0f Gflop/s) ", clientCalcs.at(i),
clientGflops.at(i));
if (i != clientCalcs.size() - 1)
printf("- ");
}
printf(" errors: ");
for (size_t i = 0; i < clientErrors.size(); ++i) {
std::string note = "%d ";
if (clientCalcs.at(i) == -1)
note += " (DIED!)";
else if (clientErrors.at(i))
note += " (WARNING!)";
printf(note.c_str(), clientErrors.at(i));
if (i != clientCalcs.size() - 1)
printf("- ");
}
printf(" temps: ");
for (size_t i = 0; i < clientTemp.size(); ++i) {
printf(clientTemp.at(i) != 0 ? "%d C " : "-- ",
clientTemp.at(i));
if (i != clientCalcs.size() - 1)
printf("- ");
}
fflush(stdout);
for (size_t i = 0; i < clientErrors.size(); ++i)
if (clientErrors.at(i))
clientFaulty.at(i) = true;
if (nextReport < elapsed) {
nextReport = elapsed + 10.0f;
printf("\n\tSummary at: ");
fflush(stdout);
system("date"); // Printing a date
fflush(stdout);
printf("\n");
for (size_t i = 0; i < clientErrors.size(); ++i)
clientErrors.at(i) = 0;
}
}
// Checking whether all clients are dead
bool oneAlive = false;
for (size_t i = 0; i < clientCalcs.size(); ++i)
if (clientCalcs.at(i) != -1)
oneAlive = true;
if (!oneAlive) {
fprintf(stderr, "\n\nNo clients are alive! Aborting\n");
exit(ENOMEDIUM);
}
if (startTime + runTime < thisTime)
break;
}
// #define OPS_PER_MUL 17188257792ul // Measured for SIZE = 2048
#define OPS_PER_MUL 1100048498688ul // Extrapolated for SIZE = 8192
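These constants track the classical GEMM cost of 2·SIZE³ floating-point operations: 2 × 2048³ = 17 179 869 184 versus the measured 17 188 257 792, and 2 × 8192³ = 1 099 511 627 776 versus the extrapolated 1 100 048 498 688. The small surplus presumably accounts for the compare kernel and other overhead.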
// pollTemp() body: a pipe carries the monitoring tool's stdout back to us
int tempPipe[2];
pipe(tempPipe);
pid_t myPid = fork();
if (!myPid) {
close(tempPipe[0]);
dup2(tempPipe[1], STDOUT_FILENO);
#if IS_JETSON
execlp("tegrastats", "tegrastats", "--interval", "5000", NULL);
fprintf(stderr, "Could not invoke tegrastats, no temps available\n");
#else
execlp("nvidia-smi", "nvidia-smi", "-l", "5", "-q", "-d", "TEMPERATURE",
NULL);
fprintf(stderr, "Could not invoke nvidia-smi, no temps available\n");
#endif
exit(ENODEV);
}
*p = myPid;
close(tempPipe[1]);
return tempPipe[0];
pollTemp() fork()s a child that runs a monitoring command (tegrastats on Jetson, nvidia-smi elsewhere) with its stdout redirected into the pipe, and returns the pipe's read end to the main process. A sample tegrastats line:
11-25-2024 17:51:24 RAM 6633/31003MB (lfb 4855x4MB) SWAP 30/15501MB (cached 0MB) CPU [10%@1190,5%@1190,3%@1190,3%@1191,6%@1190,8%@1189,3%@1190,5%@1190] EMC_FREQ 1%@800 GR3D_FREQ 0%@114 VIC_FREQ 115 APE 150 AUX@32.5C CPU@34C thermal@33.4C Tboard@35C AO@34C GPU@34C Tdiode@36.25C PMIC@50C GPU 0mW/0mW CPU 775mW/775mW SOC 998mW/998mW CV 0mW/0mW VDDRQ 221mW/221mW SYS5V 2985mW/2985mW
On non-Jetson systems, the nvidia-smi query yields output like:
==============NVSMI LOG==============
Timestamp : Mon Nov 25 09:59:22 2024
Driver Version : 535.183.01
CUDA Version : 12.2
Attached GPUs : 8
GPU 00000000:18:00.0
Temperature
GPU Current Temp : 50 C
GPU T.Limit Temp : 37 C
GPU Shutdown T.Limit Temp : -8 C
GPU Slowdown T.Limit Temp : -2 C
GPU Max Operating T.Limit Temp : 0 C
GPU Target Temperature : N/A
Memory Current Temp : 55 C
Memory Max Operating T.Limit Temp : 0 C
const int readSize = 10240;
static int gpuIter = 0; // which GPU the next parsed line refers to
char data[readSize + 1];
int curPos = 0;
// Read one line, byte by byte, from the temperature pipe
do {
read(handle, data + curPos, sizeof(char));
} while (data[curPos++] != '\n');
data[curPos - 1] = 0; // replace the newline with a terminating NUL
#if IS_JETSON
std::string data_str(data);
std::regex pattern("GPU@([0-9]+)C");
std::smatch matches;
if (std::regex_search(data_str, matches, pattern)) {
if (matches.size() > 1) {
int tempValue = std::stoi(matches[1]);
temps->at(gpuIter) = tempValue;
gpuIter = (gpuIter + 1) % (temps->size());
}
}
#else
// FIXME: The syntax of this print might change in the future..
int tempValue;
if (sscanf(data,
" GPU Current Temp : %d C",
&tempValue) == 1) {
temps->at(gpuIter) = tempValue;
gpuIter = (gpuIter + 1) % (temps->size());
} else if (!strcmp(data, " Gpu "
" : N/A"))
gpuIter =
(gpuIter + 1) %
(temps->size()); // We rotate the iterator for N/A values as well
#endif
updateTemps() keeps reading GPU temperature lines from the pipe returned by pollTemp(), again with separate parsing for Jetson and non-Jetson devices.
compare.cu defines the CUDA kernels that run on the GPU:
extern "C" __global__ void compare(float *C, int *faultyElems, size_t iters)
size_t iterStep = blockDim.x*blockDim.y*gridDim.x*gridDim.y;
size_t myIndex = (blockIdx.y*blockDim.y + threadIdx.y)* // Y
gridDim.x*blockDim.x + // W
blockIdx.x*blockDim.x + threadIdx.x; // X
int myFaulty = 0;
for (size_t i = 1; i < iters; ++i)
if (fabsf(C[myIndex] - C[myIndex + i*iterStep]) > EPSILON)
myFaulty++;
atomicAdd(faultyElems, myFaulty);
extern "C" __global__ void compareD(double *C, int *faultyElems, size_t iters)
size_t iterStep = blockDim.x*blockDim.y*gridDim.x*gridDim.y;
size_t myIndex = (blockIdx.y*blockDim.y + threadIdx.y)* // Y
gridDim.x*blockDim.x + // W
blockIdx.x*blockDim.x + threadIdx.x; // X
int myFaulty = 0;
for (size_t i = 1; i < iters; ++i)
if (fabs(C[myIndex] - C[myIndex + i*iterStep]) > EPSILOND)
myFaulty++;
atomicAdd(faultyElems, myFaulty);
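Both kernels rely on the launch tiling exactly one matrix: with 16 × 16 blocks on a (SIZE/16) × (SIZE/16) grid, iterStep = blockDim.x · blockDim.y · gridDim.x · gridDim.y = SIZE², the element count of a single matrix. Each thread owns one element position and compares the first output matrix against the same position in every later C; a difference beyond EPSILON (EPSILOND for doubles) is counted as faulty and accumulated into *faultyElems via atomicAdd(). Since every iteration computed the identical product A·B, any mismatch points to a hardware fault.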