我编写了一些内核函数,想知道执行这些函数需要多少毫秒。
using namespace std;
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#define N 8000
// Fills data[0 .. count-1] with pseudo-random integers in the range [0, 99].
// Values are drawn from rand() in ascending index order, so the contents
// depend on the current srand() seed. Nothing is written when count <= 0.
void fillArray(int *data, int count) {
    int idx = 0;
    while (idx < count) {
        data[idx] = rand() % 100;
        ++idx;
    }
}
// Kernel: each thread with a global index below N computes a[i] + b[i].
// NOTE: the sum is only assigned to a local variable and is never written
// back to memory, so this kernel has no observable effect.
__global__ void add(int* a, int *b) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;  // global thread index
    if (idx >= N)
        return;  // this thread is past the end of the data
    int add = a[idx] + b[idx];
}
// Kernel: each thread with a global index below N computes a[i] - b[i].
// NOTE: the difference is only assigned to a local variable and is never
// written back to memory, so this kernel has no observable effect.
__global__ void subtract(int* a, int *b) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;  // global thread index
    if (idx >= N)
        return;  // this thread is past the end of the data
    int subtract = a[idx] - b[idx];
}
// Kernel: each thread with a global index below N computes a[i] * b[i].
// NOTE: the product is only assigned to a local variable and is never
// written back to memory, so this kernel has no observable effect.
__global__ void multiply(int* a, int *b) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;  // global thread index
    if (idx >= N)
        return;  // this thread is past the end of the data
    int multiply = a[idx] * b[idx];
}
// Kernel: each thread with a global index below N computes a[i] / b[i]
// (integer division; b[i] is not checked for zero).
// NOTE: the quotient is only assigned to a local variable and is never
// written back to memory, so this kernel has no observable effect.
__global__ void divide(int* a, int *b) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;  // global thread index
    if (idx >= N)
        return;  // this thread is past the end of the data
    int divide = a[idx] / b[idx];
}
// Kernel: each thread with a global index below N computes a[i] % b[i]
// (b[i] is not checked for zero).
// NOTE: the remainder is only assigned to a local variable and is never
// written back to memory, so this kernel has no observable effect.
__global__ void modu(int* a, int *b) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;  // global thread index
    if (idx >= N)
        return;  // this thread is past the end of the data
    int modulus = a[idx] % b[idx];
}
// Kernel: negates data[i] in place for every global thread index i below N.
// Unlike the arithmetic kernels in this program, this one actually writes
// its result back through the pointer it was given.
__global__ void neg(int *data) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;  // global thread index
    if (idx >= N)
        return;  // this thread is past the end of the data
    data[idx] = -data[idx];
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaCallOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Launches add, subtract, multiply, divide, modu and neg (on both arrays)
// back to back on the default stream and times them with CUDA events.
//
// devA, devB       pointers that the kernels dereference on the device; they
//                  must therefore be device allocations, not host arrays.
//                  neg overwrites both arrays in place.
// blocksPerGrid,
// threadsPerBlock  launch configuration used for every kernel.
//
// Returns the elapsed time in milliseconds between the two events, or -1.0f
// if any CUDA call or kernel failed (details are printed to stderr). The
// original version ignored every error and returned an uninitialized float
// when the timing query failed.
float duration(int *devA, int *devB, int blocksPerGrid, int threadsPerBlock) {
    cudaEvent_t start, stop;
    float elapsedTime = 0.0f;
    bool ok = true;

    if (!cudaCallOk(cudaEventCreate(&start), "cudaEventCreate(start)"))
        return -1.0f;
    if (!cudaCallOk(cudaEventCreate(&stop), "cudaEventCreate(stop)")) {
        cudaEventDestroy(start);
        return -1.0f;
    }

    ok = cudaCallOk(cudaEventRecord(start, 0), "cudaEventRecord(start)") && ok;

    add<<<blocksPerGrid, threadsPerBlock>>>(devA, devB);
    subtract<<<blocksPerGrid, threadsPerBlock>>>(devA, devB);
    multiply<<<blocksPerGrid, threadsPerBlock>>>(devA, devB);
    divide<<<blocksPerGrid, threadsPerBlock>>>(devA, devB);
    modu<<<blocksPerGrid, threadsPerBlock>>>(devA, devB);
    neg<<<blocksPerGrid, threadsPerBlock>>>(devA);
    neg<<<blocksPerGrid, threadsPerBlock>>>(devB);
    // Kernel launches return nothing; an invalid launch configuration only
    // shows up through cudaGetLastError().
    ok = cudaCallOk(cudaGetLastError(), "kernel launch") && ok;

    ok = cudaCallOk(cudaEventRecord(stop, 0), "cudaEventRecord(stop)") && ok;
    // Waiting on the stop event also surfaces faults raised while the
    // kernels were executing (e.g. an illegal address).
    ok = cudaCallOk(cudaEventSynchronize(stop), "kernel execution") && ok;
    ok = cudaCallOk(cudaEventElapsedTime(&elapsedTime, start, stop),
                    "cudaEventElapsedTime") && ok;

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ok ? elapsedTime : -1.0f;
}
// Allocates two device arrays of N ints, fills them with random host data,
// times the kernels via duration() and prints the result.
// Returns 0 on success, 1 if device allocation or the host-to-device copies fail.
int main(void) {
    int a[N], b[N];
    float dur = 0;
    int *devA = NULL, *devB = NULL;

    if (cudaMalloc((void**) &devA, N * sizeof(int)) != cudaSuccess ||
        cudaMalloc((void**) &devB, N * sizeof(int)) != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(cudaGetLastError()));
        cudaFree(devA);
        cudaFree(devB);
        return 1;
    }

    fillArray(a, N);
    fillArray(b, N);

    // Fix: the second copy used to target devA again, which overwrote `a`
    // on the device and left devB uninitialized.
    if (cudaMemcpy(devA, a, N * sizeof(int), cudaMemcpyHostToDevice) != cudaSuccess ||
        cudaMemcpy(devB, b, N * sizeof(int), cudaMemcpyHostToDevice) != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaGetLastError()));
        cudaFree(devA);
        cudaFree(devB);
        return 1;
    }

    // Fix: pass the device pointers. The host arrays a/b were passed before,
    // and kernels cannot dereference host stack memory.
    // The launch uses N blocks of a single thread each.
    dur = duration(devA, devB, N, 1);

    cout << "Global memory version:\n";
    cout << "Process completed in " << dur;
    cout << " for a data set of " << N << " integers.";

    // Release the device allocations (previously leaked).
    cudaFree(devA);
    cudaFree(devB);
    return 0;
}
计时结果(毫秒)总是返回零。为什么?我遗漏了什么?如果我从 duration 函数中删除对 neg 函数的调用,它返回 0.15687 毫秒。我认为对于执行这些函数来说,这个数值太小了。这个程序有什么问题?
编辑后,我这样做了:
using namespace std;
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
const int N = 8000;
// Writes `count` pseudo-random integers in [0, 99] into `data`, front to back.
// Each value is rand() % 100, so the result depends on the current srand()
// seed. A count of zero or less writes nothing.
void fillArray(int *data, int count) {
    for (int remaining = count; remaining > 0; --remaining) {
        *data = rand() % 100;
        ++data;
    }
}
// Kernel: c[i] = a[i] + b[i] for every global thread index i below N.
// Threads whose index is N or larger do nothing.
__global__ void add(int* a, int *b, int *c) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;  // global thread index
    if (idx >= N)
        return;
    c[idx] = a[idx] + b[idx];
}
// Kernel: c[i] = a[i] - b[i] for every global thread index i below N.
// Threads whose index is N or larger do nothing.
__global__ void subtract(int* a, int *b, int *c) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;  // global thread index
    if (idx >= N)
        return;
    c[idx] = a[idx] - b[idx];
}
// Kernel: c[i] = a[i] * b[i] for every global thread index i below N.
// Threads whose index is N or larger do nothing.
__global__ void multiply(int* a, int *b, int *c) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;  // global thread index
    if (idx >= N)
        return;
    c[idx] = a[idx] * b[idx];
}
// Kernel: c[i] = a[i] / b[i] (integer division) for every global thread
// index i below N. Threads whose index is N or larger do nothing.
//
// Division by zero is undefined, and the inputs in this program come from
// rand() % 100, which does produce zeros. A zero divisor therefore yields a
// defined result of 0 instead of an unspecified value.
__global__ void divide(int* a, int *b, int *c) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        const int divisor = b[tid];
        c[tid] = (divisor != 0) ? (a[tid] / divisor) : 0;
    }
}
// Kernel: c[i] = a[i] % b[i] for every global thread index i below N.
// Threads whose index is N or larger do nothing.
//
// Modulo by zero is undefined, and the inputs in this program come from
// rand() % 100, which does produce zeros. A zero divisor therefore yields a
// defined result of 0 instead of an unspecified value.
__global__ void modu(int* a, int *b, int *c) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        const int divisor = b[tid];
        c[tid] = (divisor != 0) ? (a[tid] % divisor) : 0;
    }
}
// Kernel: c[i] = -data[i] for every global thread index i below N.
// The input array is left untouched; threads past N do nothing.
__global__ void neg(int *data, int *c) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;  // global thread index
    if (idx >= N)
        return;
    c[idx] = -data[idx];
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t status, const char *what) {
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Runs add, subtract, multiply, divide, modu and neg (on devA, then devB),
// copying devC back to the host after every kernel, and times the whole
// sequence with CUDA events.
//
// devA, devB       input arrays of N ints, dereferenced by the kernels on the device.
// devC             output array of N ints, overwritten by every kernel.
// blocksPerGrid,
// threadsPerBlock  launch configuration used for every kernel.
//
// Returns the elapsed time in milliseconds, or -1.0f if any CUDA call failed
// (details are printed to stderr). The measured interval includes the seven
// device-to-host copies, not just kernel execution.
float duration(int *devA, int *devB, int *devC, int blocksPerGrid, int threadsPerBlock) {
    cudaEvent_t start, stop;
    float elapsedTime = 0.0f;
    // Fix: the staging buffer was declared as double[N] although it receives
    // N ints; its element type now matches the data copied into it.
    int hArrayC[N];
    const size_t bytes = N * sizeof(int);
    bool ok = true;

    if (!checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)"))
        return -1.0f;
    if (!checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)")) {
        cudaEventDestroy(start);
        return -1.0f;
    }

    ok = checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)") && ok;

    // Each blocking cudaMemcpy waits for the preceding kernel, so its status
    // also reports launch or execution failures of that kernel.
    add<<<blocksPerGrid, threadsPerBlock>>>(devA, devB, devC);
    ok = checkCuda(cudaMemcpy(hArrayC, devC, bytes, cudaMemcpyDeviceToHost), "add") && ok;
    subtract<<<blocksPerGrid, threadsPerBlock>>>(devA, devB, devC);
    ok = checkCuda(cudaMemcpy(hArrayC, devC, bytes, cudaMemcpyDeviceToHost), "subtract") && ok;
    multiply<<<blocksPerGrid, threadsPerBlock>>>(devA, devB, devC);
    ok = checkCuda(cudaMemcpy(hArrayC, devC, bytes, cudaMemcpyDeviceToHost), "multiply") && ok;
    divide<<<blocksPerGrid, threadsPerBlock>>>(devA, devB, devC);
    ok = checkCuda(cudaMemcpy(hArrayC, devC, bytes, cudaMemcpyDeviceToHost), "divide") && ok;
    modu<<<blocksPerGrid, threadsPerBlock>>>(devA, devB, devC);
    ok = checkCuda(cudaMemcpy(hArrayC, devC, bytes, cudaMemcpyDeviceToHost), "modu") && ok;
    neg<<<blocksPerGrid, threadsPerBlock>>>(devA, devC);
    ok = checkCuda(cudaMemcpy(hArrayC, devC, bytes, cudaMemcpyDeviceToHost), "neg(devA)") && ok;
    neg<<<blocksPerGrid, threadsPerBlock>>>(devB, devC);
    ok = checkCuda(cudaMemcpy(hArrayC, devC, bytes, cudaMemcpyDeviceToHost), "neg(devB)") && ok;

    ok = checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)") && ok;
    ok = checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)") && ok;
    ok = checkCuda(cudaEventElapsedTime(&elapsedTime, start, stop),
                   "cudaEventElapsedTime") && ok;

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ok ? elapsedTime : -1.0f;
}
// Allocates three device arrays of N ints (two inputs, one output), uploads
// random input data, times the kernels via duration() and prints the result.
// Returns 0 on success, 1 if device allocation or the uploads fail.
int main(void) {
    int a[N], b[N];
    float dur = 0;
    int *devA = NULL, *devB = NULL, *devC = NULL;
    const size_t bytes = N * sizeof(int);

    // 256 threads per block, with enough blocks to cover all N elements; the
    // kernels guard against the surplus threads in the last block.
    const int threadsPerBlock = 256;
    const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    if (cudaMalloc((void**) &devA, bytes) != cudaSuccess ||
        cudaMalloc((void**) &devB, bytes) != cudaSuccess ||
        cudaMalloc((void**) &devC, bytes) != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(cudaGetLastError()));
        cudaFree(devA);
        cudaFree(devB);
        cudaFree(devC);
        return 1;
    }

    fillArray(a, N);
    fillArray(b, N);

    // devC is a pure output buffer: zero it on the device instead of uploading
    // an uninitialized host array as the original code did.
    if (cudaMemcpy(devA, a, bytes, cudaMemcpyHostToDevice) != cudaSuccess ||
        cudaMemcpy(devB, b, bytes, cudaMemcpyHostToDevice) != cudaSuccess ||
        cudaMemset(devC, 0, bytes) != cudaSuccess) {
        fprintf(stderr, "device upload failed: %s\n", cudaGetErrorString(cudaGetLastError()));
        cudaFree(devA);
        cudaFree(devB);
        cudaFree(devC);
        return 1;
    }

    dur = duration(devA, devB, devC, blocksPerGrid, threadsPerBlock);

    cout << "Global memory version:\n";
    cout << "Process completed in " << dur;
    cout << " for a data set of " << N << " integers.";

    cudaFree(devA);
    cudaFree(devB);
    cudaFree(devC);  // previously leaked
    return 0;
}
答案 0 :(得分:0)
CUDA 任务在设备上运行,不会阻塞 CPU 线程。因此,只有当您尝试从设备内存读取尚未计算完成的数据时,或者当您调用 cudaDeviceSynchronize() 显式地将 CPU 线程与 GPU 同步时,CUDA 调用才会阻塞。如果要测量计算时间,则需要在停止计时器之前进行同步。
如果您想测量内存复制所花的时间,则需要在计算启动之后、复制计时器开始之前进行同步,否则计算时间会被算进复制时间里。
您可以使用 CUDA SDK 中附带的性能分析器来测量所有 CUDA 调用的耗时。
答案 1 :(得分:0)
您的内核没有做任何事情,因为您只将结果存储在寄存器中。编译时,会收到一些警告:
kernel.cu(13):警告:变量“add”已设置但从未使用过
此外,如果您想得到更准确的计时结果,请使用 NVIDIA 的性能分析器:nvprof(命令行)或 nvvp(图形界面)。
$ nvprof ./kernel
======== NVPROF is profiling kernel...
======== Command: kernel
Global memory version:
Process completed in 0 for a data set of 8000 integers.
======== Profiling result:
 Time(%)      Time   Calls       Avg       Min       Max  Name
  100.00   18.46us       2    9.23us    6.02us   12.45us  [CUDA memcpy HtoD]
    0.00       0ns       1       0ns       0ns       0ns  multiply(int*, int*)
    0.00       0ns       1       0ns       0ns       0ns  add(int*, int*)
    0.00       0ns       1       0ns       0ns       0ns  modu(int*, int*)
    0.00       0ns       2       0ns       0ns       0ns  neg(int*)
    0.00       0ns       1       0ns       0ns       0ns  subtract(int*, int*)
    0.00       0ns       1       0ns       0ns       0ns  divide(int*, int*)
另外,您在每个网格中使用了 N 个块,而每个块只有 1 个线程。您应该考虑阅读 this question 的答案。
至于向量加法(以及其他简单运算)本身,您应该研究 CUDA SDK 的 vectorAdd 示例,或者使用 Thrust。第一个选项会教您如何使用 CUDA,第二个选项会向您展示可以用 Thrust 完成的高级操作。如果我是您,我会两者都做。
答案 2 :(得分:-1)
尝试使用 float(或 double)类型的变量和数组代替 int 来存储所有算术变量和运算结果。有时时间间隔太小,整数值总会被舍入为零。