I need to find the average of several thousand (20,000+) images, each represented as an array of unsigned short. Could you please check my code; it seems to me that it is not optimal.
My kernel:
__global__ void VecAdd(unsigned short *A, float *B, unsigned int Size, float div){
    register float divider = div;
    register int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if ( idx < Size) {
        B[ idx ] = (float) A[idx] / divider + B[idx];
    }
    //__syncthreads();
}
The kernel wrapper:
void kernel_wrapper(unsigned short* pixels1, float* pixels2, unsigned int length, float div)
{
    unsigned short* deviceData1;
    float* deviceData2;
    cudaMalloc((void**)&deviceData1, length * sizeof(unsigned short));
    cudaMalloc((void**)&deviceData2, length * sizeof(float));
    cudaMemcpy(deviceData1, pixels1, length * sizeof(unsigned short), cudaMemcpyHostToDevice);
    cudaMemcpy(deviceData2, pixels2, length * sizeof(float), cudaMemcpyHostToDevice);
    int threads = 1024; // my maximum
    int blocks = (length / threads); // length = 1280*960 -> blocks = 1200 (assumes length is a multiple of threads)
    VecAdd<<< blocks, threads >>>( deviceData1, deviceData2, length, div );
    cudaMemcpy(pixels2, deviceData2, length * sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree( deviceData1 );
    cudaFree( deviceData2 );
}
Then I do:
float* avrg2f = (float*)malloc( width * height * sizeof(float));
memset( avrg2f, 0.0, sizeof(float) * width * height);
for (int k = 0; k < count; k++) {
    imageObjectList.at( curObj )->getImage( k );
    kernel_wrapper( avrg1, avrg2f, height * width, (float)count);
}
As a result, the average image should end up in avrg2f.
Thank you.
Answer 0 (score: 2)
If the images are all the same size, then your wrapper function does not need to perform the cudaMalloc and cudaFree operations on every call. Pre-allocate the needed storage once, and don't allocate and free it on every call to the wrapper.
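For example, here is a minimal sketch of the allocate-once pattern. The setup/teardown function names and the global buffer pointers are illustrative, not part of your code:

#include <cuda_runtime.h>

// Hypothetical helpers: device buffers are created once, reused for every image,
// and released only after all images have been processed.
unsigned short *d_pixels1 = NULL;   // device copy of one input image
float          *d_pixels2 = NULL;   // device accumulator for the running sum

void setup_buffers(unsigned int length)
{
    cudaMalloc((void**)&d_pixels1, length * sizeof(unsigned short));
    cudaMalloc((void**)&d_pixels2, length * sizeof(float));
    cudaMemset(d_pixels2, 0, length * sizeof(float));   // all-zero bit pattern == 0.0f
}

void teardown_buffers()
{
    cudaFree(d_pixels1);
    cudaFree(d_pixels2);
}

The per-image call then only has to copy the new image in and launch the kernel, which is what kernel_wrapper2 in the complete listing below does.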
Also, if you use a pinned allocation (cudaHostAlloc) on the host side for your image storage, you may see roughly a 2x speedup in the cudaMemcpy operations.
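A minimal sketch of what the pinned allocation looks like (replacing a plain malloc for the host-side image buffer; error checking omitted):

const int width  = 1280;   // image dimensions from your question
const int height = 960;
unsigned short *h_image = NULL;
// Page-locked (pinned) host memory transfers faster than pageable malloc'd memory.
cudaHostAlloc((void**)&h_image, width * height * sizeof(unsigned short), cudaHostAllocDefault);
// ... fill h_image and pass it to cudaMemcpy exactly as before ...
cudaFreeHost(h_image);     // pinned memory is released with cudaFreeHost, not free()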
Finally, for the duration of your loop there is no need to copy the results back to the host. Do that once, after you have finished computing the average. This saves 2 of the 3 cudaMemcpy operations you are doing in the wrapper.
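Putting the two previous points together, the per-image loop might look roughly like this. This is only a sketch: it assumes the pre-allocated d_pixels1/d_pixels2 buffers from above and reuses the names (h_image, blocks, threads, count, avrg2f) from your code:

for (int k = 0; k < count; k++) {
    // copy only the new input image each iteration; the accumulator stays on the device
    cudaMemcpy(d_pixels1, h_image, width * height * sizeof(unsigned short), cudaMemcpyHostToDevice);
    VecAdd<<< blocks, threads >>>(d_pixels1, d_pixels2, width * height, (float)count);
}
// a single device-to-host copy after the loop, instead of one per image
cudaMemcpy(avrg2f, d_pixels2, width * height * sizeof(float), cudaMemcpyDeviceToHost);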
While we're at it, in my opinion using memset to initialize a float array is questionable. It happens to work for a zero value, but essentially for no other value. Furthermore, I would expect passing 0.0 as the second parameter of memset (which takes an int) to at least throw a compiler warning.
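As a sketch, here are two equivalent and less error-prone ways to zero-initialize the float buffer (std::fill requires <algorithm>):

float* avrg2f = (float*)malloc(width * height * sizeof(float));
// explicit loop: works for any initial value, not just zero
for (int i = 0; i < width * height; i++)
    avrg2f[i] = 0.0f;
// or equivalently:
std::fill(avrg2f, avrg2f + width * height, 0.0f);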
The following code shows the above optimizations and demonstrates about an 8x speedup over your code in my test case:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <time.h>

__global__ void VecAdd(unsigned short *A, float *B, unsigned int Size, float div){
    register float divider = div;
    register int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if ( idx < Size) {
        B[ idx ] = (float) A[idx] / divider + B[idx];
    }
    //__syncthreads();
}

__global__ void VecAdd2(unsigned short *A, float *B, unsigned int Size, float mult){
    register int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if ( idx < Size) {
        B[ idx ] = (float) A[idx] * mult + B[idx];
    }
}

// original wrapper: allocates, copies, launches, copies back, and frees on every call
void kernel_wrapper(unsigned short* pixels1, float* pixels2, unsigned int length, float div)
{
    unsigned short* deviceData1;
    float* deviceData2;
    cudaMalloc((void**)&deviceData1, length * sizeof(unsigned short));
    cudaMalloc((void**)&deviceData2, length * sizeof(float));
    cudaMemcpy(deviceData1, pixels1, length * sizeof(unsigned short), cudaMemcpyHostToDevice);
    cudaMemcpy(deviceData2, pixels2, length * sizeof(float), cudaMemcpyHostToDevice);
    int threads = 1024; // my maximum
    int blocks = (length / threads); // length = 1280*960 -> blocks = 1200
    VecAdd<<< blocks, threads >>>( deviceData1, deviceData2, length, div );
    cudaMemcpy(pixels2, deviceData2, length * sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree( deviceData1 );
    cudaFree( deviceData2 );
}

// optimized wrapper: device buffers are pre-allocated by the caller; only the
// input image is copied each call, and the result stays on the device
void kernel_wrapper2(unsigned short* h_pixels1, unsigned short* d_pixels1, float* d_pixels2, unsigned int length, float my_mult)
{
    cudaMemcpy(d_pixels1, h_pixels1, length * sizeof(unsigned short), cudaMemcpyHostToDevice);
    int threads = 1024; // my maximum
    int blocks = (length / threads); // length = 1280*960 -> blocks = 1200
    VecAdd2<<< blocks, threads >>>( d_pixels1, d_pixels2, length, my_mult );
}

int main(){
    const int count = 2000;
    const int width = 1280;
    const int height = 960;
    timeval t1, t2;
    unsigned long et;

    // case 1: pageable host memory, original wrapper
    unsigned short *h1_image;
    h1_image = (unsigned short *)malloc(height*width*sizeof(unsigned short));
    float* avrg2f = (float*)malloc( width * height * sizeof(float));
    for (int i = 0; i<height*width; i++){
        h1_image[i] = (i%256);
        avrg2f[i] = 0.0f;
    }
    gettimeofday(&t1,NULL);
    for (int k = 0; k < count; k++) {
        kernel_wrapper( h1_image, avrg2f, height * width, (float)count);
    }
    gettimeofday(&t2,NULL);
    et = ((t2.tv_sec * 1000000)+t2.tv_usec) - ((t1.tv_sec * 1000000) + t1.tv_usec);
    printf("time 1 = %ld us\n", et);

    // case 2: pinned host memory, pre-allocated device buffers, single copy-back
    unsigned short *h2_image;
    float* avrg3f = (float*)malloc( width * height * sizeof(float));
    cudaHostAlloc((void **)&h2_image, height*width*sizeof(unsigned short), cudaHostAllocDefault);
    for (int i = 0; i<height*width; i++){
        h2_image[i] = (i%256);
        avrg3f[i] = 0.0f;
    }
    gettimeofday(&t1,NULL);
    unsigned short *d_image;
    float *d_result;
    cudaMalloc((void **)&d_image, height*width*sizeof(unsigned short));
    cudaMalloc((void **)&d_result, height*width*sizeof(float));
    cudaMemcpy(d_result, avrg3f, height*width*sizeof(float), cudaMemcpyHostToDevice);
    for (int k = 0; k < count; k++) {
        kernel_wrapper2( h2_image, d_image, d_result, height * width, (float)(1/(float)count));
    }
    cudaMemcpy(avrg3f, d_result, height*width*sizeof(float), cudaMemcpyDeviceToHost);
    gettimeofday(&t2,NULL);
    et = ((t2.tv_sec * 1000000)+t2.tv_usec) - ((t1.tv_sec * 1000000) + t1.tv_usec);
    printf("time 2 = %ld us\n", et);

    // verify that both approaches produce the same average image
    for (int i = 0; i < (height*width); i++)
        if (fabs(avrg2f[i] - avrg3f[i]) > 0.0001) {printf("mismatch at %d, 1 = %f, 2 = %f\n", i, avrg2f[i], avrg3f[i]); return 1;}
    return 0;
}
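This should build as a single .cu file with something like nvcc -O2 -o avg avg.cu (the file name here is just an example). The program prints the two timings and exits non-zero if the two results disagree.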