这是我的代码:
using namespace std;
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
const int N = 8000;
void fillArray(int *data, int count) {
for (int i = 0; i < count; i++)
data[i] = rand() % 100;
}
__global__ void add(int* a, int *b, int *c) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) {
c[tid] = a[tid] + b[tid];
}
}
__global__ void subtract(int* a, int *b, int *c) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) {
c[tid] = a[tid] - b[tid];
}
}
__global__ void multiply(int* a, int *b, int *c) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) {
c[tid] = a[tid] * b[tid];
}
}
__global__ void divide(int* a, int *b, int *c) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) {
c[tid] = a[tid] / b[tid];
}
}
__global__ void modu(int* a, int *b, int *c) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) {
c[tid] = a[tid] % b[tid];
}
}
__global__ void neg(int *data, int *c) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) {
c[tid] = -data[tid];
}
}
float duration(int *devA, int *devB, int *devC, int blocksPerGrid, int threadsPerBlock) {
cudaEvent_t start, stop;
float elapsedTime;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
int hArrayC[N];
add<<<blocksPerGrid, threadsPerBlock>>>(devA, devB,devC);
cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);
subtract<<<blocksPerGrid, threadsPerBlock>>>(devA, devB,devC);
cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);
multiply<<<blocksPerGrid, threadsPerBlock>>>(devA, devB,devC);
cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);
divide<<<blocksPerGrid, threadsPerBlock>>>(devA, devB,devC);
cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);
modu<<<blocksPerGrid, threadsPerBlock>>>(devA, devB,devC);
cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);
neg<<<blocksPerGrid, threadsPerBlock>>>(devA,devC);
cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);
neg<<<blocksPerGrid, threadsPerBlock>>>(devB,devC);
cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
return elapsedTime;
}
int main(void) {
int *a, *b;
a = new int[N];
b = new int [N];
float dur = 0;
int *devA, *devB,*devC;
cudaMalloc((void**) &devA, N * sizeof(int));
cudaMalloc((void**) &devB, N * sizeof(int));
cudaMalloc((void**) &devC, N * sizeof(int));
fillArray(a, N);
fillArray(b, N);
cudaMemcpy(devA, a, N * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(devB, b, N * sizeof(int), cudaMemcpyHostToDevice);
dur = duration(devA, devB, devC,N, 1);
cout << "Global memory version:\n";
cout << "Process completed in " << dur;
cout << " for a data set of " << N << " integers.";
cudaFree(devA);
cudaFree(devB);
delete [] a;
delete [] b;
return 0;
}
我想知道持续时间函数的总毫秒数。但是,毫秒总是以不同的值返回。有时它是10毫秒有时它是0.78652有时它是30毫秒。为什么?我的代码出了什么问题?
答案 0 :(得分:1)
这可能是由NVIDIA驱动程序的加载/卸载引起的。将其视为GPU的初始化步骤。
您可以将GPU设置为持久模式:
nvidia-smi -pm 1
或者您可以在计算GPU代码以触发加载驱动程序之前运行虚拟内核:
__global__ void dummy()
{
// This kernel does nothing, this is just a "warm-up"
}
// Before your cudaEventRecord etc.
dummy<<<blocksPerGrid, threadsPerBlock>>>();
或者可以在计算内核之前使用cudaThreadSynchronize()
。