我编写了一个在Matlab中使用的前景提取内核,事情并没有打印任何东西所以我把它移植到纯Cuda C并取出了大部分逻辑。这件事没有做任何事情,甚至在返回之前都没有打印cuPrintf语句,任何想法为什么?
#include <cuda.h>
#include <stdio.h> /* printf, scanf, NULL */
#include <stdlib.h> /* calloc, exit, free */
#include "cuPrintf.cu"
#include "utils.h"
#include <time.h> /* clock_t, clock, CLOCKS_PER_SEC */
__global__ void foreground_extract( unsigned char* inputImageRed,
unsigned char* inputImageGreen,
unsigned char* inputImageBlue,
unsigned char* outputImageRed,
unsigned char* outputImageGreen,
unsigned char* outputImageBlue,
const int xDim,
const int yDim)
{
cuPrintf("print something \n");
//x = col, y = row
//xDim = col_dim, yDim = row_dim
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y *blockDim.x *gridDim.x;
int nnodes = xDim*yDim;
if (offset >= nnodes) return;
//test equality
outputImageRed[offset] = inputImageRed[offset];
outputImageGreen[offset] = inputImageGreen[offset];
outputImageBlue[offset] = inputImageBlue[offset];
cuPrintf("print something here too \n");
cuPrintf("%d \n", outputImageRed[offset]);
}
int main()
{
int xDim = 3;
int yDim = 3;
unsigned char* h_inputImageRed;
unsigned char* h_inputImageGreen;
unsigned char* h_inputImageBlue;
unsigned char* h_outputImageRed;
unsigned char* h_outputImageGreen;
unsigned char* h_outputImageBlue;
h_inputImageRed = (unsigned char*) calloc ((xDim*yDim), sizeof(unsigned char));
h_inputImageGreen = (unsigned char*) calloc ((xDim*yDim), sizeof(unsigned char));
h_inputImageBlue = (unsigned char*) calloc ((xDim*yDim), sizeof(unsigned char));
h_outputImageRed = (unsigned char*) calloc ((xDim*yDim), sizeof(unsigned char));
h_outputImageGreen = (unsigned char*) calloc ((xDim*yDim), sizeof(unsigned char));
h_outputImageBlue = (unsigned char*) calloc ((xDim*yDim), sizeof(unsigned char));
//initiate input only
unsigned char init =0;
for (int i=0; i<(xDim*yDim);i++){
h_inputImageRed[i] = init;
h_inputImageGreen[i] = init;
h_inputImageBlue[i] = init;
init++;
printf("%d\n", h_inputImageRed[i]);
}
//device arrays
unsigned char* d_inputImageRed;
unsigned char* d_inputImageGreen;
unsigned char* d_inputImageBlue;
unsigned char* d_outputImageRed;
unsigned char* d_outputImageGreen;
unsigned char* d_outputImageBlue;
//cudaMallocs
checkCudaErrors(cudaMalloc((void**)&d_inputImageRed, (sizeof(unsigned char)*xDim*yDim)));
checkCudaErrors(cudaMalloc((void**)&d_inputImageGreen, (sizeof(unsigned char)*xDim*yDim)));
checkCudaErrors(cudaMalloc((void**)&d_inputImageBlue, (sizeof(unsigned char)*xDim*yDim)));
checkCudaErrors(cudaMalloc((void**)&d_outputImageRed, (sizeof(unsigned char)*xDim*yDim)));
checkCudaErrors(cudaMalloc((void**)&d_outputImageGreen, (sizeof(unsigned char)*xDim*yDim)));
checkCudaErrors(cudaMalloc((void**)&d_outputImageBlue, (sizeof(unsigned char)*xDim*yDim)));
//cudaMemcpys, Host to Device
checkCudaErrors(cudaMemcpy(d_inputImageRed, h_inputImageRed, (sizeof(unsigned char)*xDim*yDim), cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_inputImageGreen, h_inputImageGreen, (sizeof(unsigned char)*xDim*yDim), cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_inputImageBlue, h_inputImageBlue, (sizeof(unsigned char)*xDim*yDim), cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_outputImageRed, h_outputImageRed, (sizeof(unsigned char)*xDim*yDim), cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_outputImageGreen, h_outputImageGreen, (sizeof(unsigned char)*xDim*yDim), cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_outputImageBlue, h_outputImageBlue, (sizeof(unsigned char)*xDim*yDim), cudaMemcpyHostToDevice));
cudaPrintfInit();
int gridSizeX = ceil(float(xDim/8));
int gridSizeY = ceil(float(yDim/8));
int gridSizeZ = 1;
int blockSizeX=8;
int blockSizeY=8;
int blockSizeZ=1;
const dim3 gridSize(gridSizeX,gridSizeY,gridSizeZ);
const dim3 blockSize(blockSizeX,blockSizeY,blockSizeZ);
foreground_extract <<< gridSize, blockSize >>>(d_inputImageRed,
d_inputImageGreen,
d_inputImageBlue,
d_outputImageRed,
d_outputImageGreen,
d_outputImageBlue,
xDim,yDim);
cudaPrintfDisplay(stdout,true);
cudaPrintfEnd();
checkCudaErrors(cudaMemcpy(h_outputImageRed, d_outputImageRed, (sizeof(unsigned char)*xDim*yDim), cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(h_outputImageGreen, d_outputImageGreen, (sizeof(unsigned char)*xDim*yDim), cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(h_outputImageBlue, d_outputImageBlue, (sizeof(unsigned char)*xDim*yDim), cudaMemcpyDeviceToHost));
//free gpu data
checkCudaErrors( cudaFree(d_outputImageRed) );
checkCudaErrors( cudaFree(d_outputImageGreen) );
checkCudaErrors( cudaFree(d_outputImageBlue) );
checkCudaErrors( cudaFree(d_inputImageRed) );
checkCudaErrors( cudaFree(d_inputImageGreen) );
checkCudaErrors( cudaFree(d_inputImageBlue) );
//free host data
free(h_outputImageRed);
free(h_outputImageGreen);
free(h_outputImageBlue);
free(h_inputImageRed);
free(h_inputImageGreen);
free(h_inputImageBlue);
while(true){}
return 0;
}
答案 0 :(得分:3)
你的内核没有启动,这就是为什么你没有得到内核中printf的输出。 如果你在内核启动上做了正确的cuda error checking ,你会发现这一点。
内核启动返回的错误是invalid configuration argument
。
您在gridSize.x
和gridSize.y
中传递了无效值。
如果您想查看它们是什么,请在调用内核之前将其打印出来。 (一般调试提示。)
让我们来看看这一行,因为它没有按照你的想法行事:
int gridSizeX = ceil(float(xDim/8));
^ ^
both values inside the parenthesis are *integers*
您尚未将这两个值(xDim
或8
)投放到float
。因此主机编译器使用整数除法来解析括号内的数量。整数除以3/8为零。之后没有任何改变价值。仍为零。