我有一个低级别的循环我用C写的,朋友建议我用CUDA写。我已经设置了我的CUDA环境,并且一直在查看文档,但我现在仍然在努力学习2周以上的语法。谁能帮我吗?这在CUDA中会是什么样的?
float* red = new float [N];
float* green = new float [N];
float* blue = new float [N];
for (int y = 0; y < h; y++)
{
// Get row ptr from the color image
const unsigned char* src = rowptr<unsigned char>(color, 0, y, w);
// Get row ptrs for the destination channel features
float* rptr = rowptr<float>(red, 0, y, w);
float* gptr = rowptr<float>(green, 0, y, w);
float* bptr = rowptr<float>(blue, 0, y, w);
for (int x = 0; x < w; x++)
{
*rptr++ = (float)*src++;
*gptr++ = (float)*src++;
*bptr++ = (float)*src++;
}
}
答案 0 :(得分:1)
以下是一些示例代码。我不知道它是否真的能回答你的问题。您可能需要了解有关CUDA的更多信息。如果您可以节省时间,那么从this webinar获取this webinar和nvidia webinar page将花费2个小时。 cuda C programmers manual也是一个很好的可读参考。
#include <stdio.h>
#define N 256
#define NUMROW N
#define NUMCOL N
#define PIXSIZE 3
#define REDOFF 0
#define GREENOFF 1
#define BLUEOFF 2
#define nTPB 16
#define GRNVAL 5
#define REDVAL 7
#define BLUVAL 9
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
__global__ void kern(const unsigned numrow, const unsigned numcol, const unsigned char* src, float* rptr, float* gptr, float* bptr){
unsigned idx = threadIdx.x + (blockDim.x*blockIdx.x);
unsigned idy = threadIdx.y + (blockDim.y*blockIdx.y);
if ((idx < numcol) && (idy < numrow)){
rptr[(idy*numcol)+idx] = (float)src[(((idy*numcol)+idx)*PIXSIZE)+REDOFF];
gptr[(idy*numcol)+idx] = (float)src[(((idy*numcol)+idx)*PIXSIZE)+GREENOFF];
bptr[(idy*numcol)+idx] = (float)src[(((idy*numcol)+idx)*PIXSIZE)+BLUEOFF];
}
}
int main (){
float *h_red, *h_green, *h_blue;
float *d_red, *d_green, *d_blue;
unsigned char *h_img, *d_img;
if ((h_img =(unsigned char*)malloc(NUMROW*NUMCOL*PIXSIZE*sizeof(unsigned char))) == 0) {printf("malloc fail\n"); return 1;}
if ((h_red =(float*)malloc(NUMROW*NUMCOL*sizeof(float))) == 0) {printf("malloc fail\n"); return 1;}
if ((h_green =(float*)malloc(NUMROW*NUMCOL*sizeof(float))) == 0) {printf("malloc fail\n"); return 1;}
if ((h_blue =(float*)malloc(NUMROW*NUMCOL*sizeof(float))) == 0) {printf("malloc fail\n"); return 1;}
cudaMalloc((void **)&d_img, (NUMROW*NUMCOL*PIXSIZE)*sizeof(unsigned char));
cudaCheckErrors("cudaMalloc1 fail");
cudaMalloc((void **)&d_red, (NUMROW*NUMCOL)*sizeof(float));
cudaCheckErrors("cudaMalloc2 fail");
cudaMalloc((void **)&d_green, (NUMROW*NUMCOL)*sizeof(float));
cudaCheckErrors("cudaMalloc3 fail");
cudaMalloc((void **)&d_blue, (NUMROW*NUMCOL)*sizeof(float));
cudaCheckErrors("cudaMalloc4 fail");
for (int i=0; i<NUMROW*NUMCOL; i++){
h_img[(i*PIXSIZE)+ REDOFF] = REDVAL;
h_img[(i*PIXSIZE)+ GREENOFF] = GRNVAL;
h_img[(i*PIXSIZE)+ BLUEOFF] = BLUVAL;
}
cudaMemcpy(d_img, h_img, (NUMROW*NUMCOL*PIXSIZE)*sizeof(unsigned char), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy1 fail");
dim3 block(nTPB, nTPB);
dim3 grid(((NUMCOL+nTPB-1)/nTPB),((NUMROW+nTPB-1)/nTPB));
kern<<<grid,block>>>(NUMROW, NUMCOL, d_img, d_red, d_green, d_blue);
cudaMemcpy(h_red, d_red, (NUMROW*NUMCOL)*sizeof(float), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy2 fail");
cudaMemcpy(h_green, d_green, (NUMROW*NUMCOL)*sizeof(float), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy3 fail");
cudaMemcpy(h_blue, d_blue, (NUMROW*NUMCOL)*sizeof(float), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy4 fail");
for (int i=0; i<(NUMROW*NUMCOL); i++){
if (h_red[i] != REDVAL) {printf("Red mismatch at offset %d\n", i); return 1;}
if (h_green[i] != GRNVAL) {printf("Green mismatch at offset %d\n", i); return 1;}
if (h_blue[i] != BLUVAL) {printf("Blue mismatch at offset %d\n", i); return 1;}
}
printf("Success!\n");
return 0;
}
在回答评论中提出的问题时,这里有一个修改过的内核,它显示了如何使用rowptr&lt;&gt;模板在评论中定义。只需用以下代码替换上面的内核代码:
template <typename T> T* rowptr(T* start, int x, int y, int w) __device__ __host__ { return start + y*w + x; }
__global__ void kern(const unsigned numrow, const unsigned numcol, unsigned char* isrc, float* rptr, float* gptr, float* bptr){
unsigned idx = threadIdx.x + (blockDim.x*blockIdx.x);
unsigned idy = threadIdx.y + (blockDim.y*blockIdx.y);
if ((idx < numcol) && (idy < numrow)){
unsigned char *src = rowptr<unsigned char>(isrc, (idx*PIXSIZE), idy, (numcol*PIXSIZE));
rptr[(idy*numcol)+idx] = (float)*src++;
gptr[(idy*numcol)+idx] = (float)*src++;
bptr[(idy*numcol)+idx] = (float)*src;
}
}
答案 1 :(得分:0)
由于此代码是一个简单的for循环,而不是编写CUDA,另一种选择是检查openACC。这样,您只需要将指令添加到现有代码中。