我有这个
cudaMemcpy(gpu_output, d_output, kMemSize, cudaMemcpyDeviceToHost);
cudaMemcpy( d_input, gpu_output, kMemSize, cudaMemcpyHostToDevice);
我必须通过将输入方向指向输出方向(假设)来避免使用这些Memcpy。 我该怎么做?
这是完整的代码:
__global__ void medianFilter1D_col(
unsigned char *d_output,
unsigned char *d_input)
{
int col, row;
unsigned char temp;
int idx, idx_south, idx_north, idx_west, idx_east, idx_north_west, idx_north_east, idx_south_east, idx_south_west;
int numcols = WIDTH + 2;
row = blockIdx.x * blockDim.x + threadIdx.x + 1;
for (col = 1; col <= WIDTH; ++col)
{
unsigned char neighborhood[9];
idx = row * numcols + col;
idx_south = (row - 1) * numcols + col;
idx_north = (row + 1) * numcols + col;
idx_west = row * numcols + (col - 1);
idx_east = row * numcols + (col + 1);
idx_north_east = (row + 1) * numcols + (col + 1);
idx_north_west = (row + 1) * numcols + (col - 1);
idx_south_east = (row - 1) * numcols + (col + 1);
idx_south_west = (row - 1) * numcols + (col - 1);
neighborhood[0]= d_input[ idx_south_west ];
neighborhood[1]= d_input[ idx_south ];
neighborhood[2]= d_input[ idx_south_east ];
neighborhood[3]= d_input[ idx_west ];
neighborhood[4]= d_input[ idx ];
neighborhood[5]= d_input[ idx_east ];
neighborhood[6]= d_input[ idx_north_west ];
neighborhood[7]= d_input[ idx_north ];
neighborhood[8]= d_input[ idx_north_east ];
for (unsigned int j=0; j<5; ++j)
{
int min=j;
for (unsigned int i=j+1; i<9; ++i)
if (neighborhood[i] < neighborhood[min])
min=i;
temp=neighborhood[j];
neighborhood[j]=neighborhood[min];
neighborhood[min]=temp;
}
d_output[idx] = neighborhood[4];
}
}
int main(int argc, char *argv[])
{
int x, y;
int i;
int errors;
double start_time_inc_data, end_time_inc_data;
double cpu_start_time, cpu_end_time;
unsigned char *d_input, *d_output, *d_edge, *tmp;
unsigned char *input_image;
unsigned char *output_image;
int rows;
int cols;
// Alojamos memoria en el host para alojar la imagen
input_image = (unsigned char*)calloc(((HEIGHT * WIDTH) * 1), sizeof(unsigned char));
// Leemos la imagen
BMP Image;
Image.ReadFromFile("lena_1024_noise.bmp");
for( int i=0 ; i < Image.TellHeight() ; i++)
for( int j=0 ; j < Image.TellWidth() ; j++)
input_image[i*WIDTH+j]=Image(i,j)->Red;
// Inicializamos a cero el array de CPU para asegurar que el
// halo tiene valores correctos
for (y = 0; y < HEIGHT + 2; y++)
for (x = 0; x < WIDTH + 2; x++)
host_input[y][x] = 0;
// Copiamos la imagen al array de CPU con el halo
for (y = 0; y < HEIGHT; y++)
for (x = 0; x < WIDTH; x++)
host_input[y + 1][x + 1] = input_image[y*WIDTH + x];
// Calculamos memoria necesaria para alojar la imagen junto con el halo
// en la memoria de la GPU.
const int kMemSize = (WIDTH+2) * (HEIGHT+2) * sizeof(unsigned char);
// Reservamos memoria en la GPU
cudaMalloc(&d_input, kMemSize);
cudaMalloc(&d_output, kMemSize);
// Copiamos todos los arrays a la memoria de la GPU.
// Tenemos en cuenta dichas transferencias en el tiempo de ejecución.
start_time_inc_data = get_current_time();
cudaMemcpy( d_input, host_input, kMemSize, cudaMemcpyHostToDevice);
cudaMemcpy( d_output, host_input, kMemSize, cudaMemcpyHostToDevice);
// Aplicamos el filtro mediana un número determinado de iteraciones.
for (i = 0; i < ITERATIONS; ++i)
{
// Ejecución kernel 1D por filas
dim3 blocksPerGrid(GRID_H, 1, 1);
dim3 threadsPerBlock(BLOCK_H, 1, 1);
//std::cout << "Grid size: (" << blocksPerGrid.x << ", " << blocksPerGrid.y << ", " << blocksPerGrid.z << ")\n";
//std::cout << "Block size: (" << threadsPerBlock.x << ", " << threadsPerBlock.y << ", " << threadsPerBlock.z << ")\n";
medianFilter1D_col<<<blocksPerGrid, threadsPerBlock>>>(d_output, d_input);
// Ejecución kernel 1D por columnas
//TODO - Calcular tamaño de bloque y grid para la correcta ejecucion del kernel
/*dim3 blocksPerGrid();
dim3 threadsPerBlock();
medianFilter1D_row<<<blocksPerGrid, threadsPerBlock>>>(d_output, d_input);*/
// Ejecución kernel 2D
// TO DO - Calcular tamaño de bloque y grid para la correcta ejecucion del kernel
/*dim3 blocksPerGrid(,);
dim3 threadsPerBlock(,);
medianFilter2D<<< blocksPerGrid, threadsPerBlock >>>(d_output, d_input);*/
cudaThreadSynchronize();
// Copiamos en la memoria de la CPU el resultado obtenido
cudaMemcpy(gpu_output, d_output, kMemSize, cudaMemcpyDeviceToHost);
// Copiamos el resultado de la GPU hacia la entrada para procesar la siguiente iteración */
cudaMemcpy( d_input, gpu_output, kMemSize, cudaMemcpyHostToDevice);
// TODO: Estas copias de memoria se pueden evitar, para ello comenta las
// transferencias anteriores e intercambia los punteros d_input y d_output
// para que la salida de esta iteración se convierta en la entrada de la
// siguiente iteración del filtro mediana.
}
cudaMemcpy(gpu_output, d_input, kMemSize, cudaMemcpyDeviceToHost);
end_time_inc_data = get_current_time();
checkCUDAError("Filtro mediana CUDA: ");
cpu_start_time = get_current_time();
unsigned char temp;
int idx, idx_south, idx_north, idx_west, idx_east, idx_north_west, idx_north_east, idx_south_east, idx_south_west;
int numcols = WIDTH + 2;
unsigned char neighborhood[9];
for (i = 0; i < ITERATIONS; i++)
{
for (y = 0; y < HEIGHT; y++)
{
for (x = 0; x < WIDTH; x++)
{
neighborhood[0]= host_input[ y+1 -1 ][ x+1 -1 ];
neighborhood[1]= host_input[ y+1 -1 ][ x+1 ];
neighborhood[2]= host_input[ y+1 -1][ x+1 +1 ];
neighborhood[3]= host_input[ y+1 ][ x+1 -1 ];
neighborhood[4]= host_input[ y+1 ][ x+1 ];
neighborhood[5]= host_input[ y+1 ][ x+1 +1 ];
neighborhood[6]= host_input[ y+1+1 ][ x+1 -1 ];
neighborhood[7]= host_input[ y+1+1 ][ x+1 ];
neighborhood[8]= host_input[ y+1+1 ][ x+1 +1];
int j=0;
for (j=0; j<5; ++j)
{
// Encontramos el mínimo
int mini=j;
for (int l=j+1; l<9; ++l)
{
if (neighborhood[l] < neighborhood[mini])
mini=l;
}
temp=neighborhood[j];
neighborhood[j]=neighborhood[mini];
neighborhood[mini]=temp;
}
host_output[y+1][x+1]=neighborhood[4];
}
}
for (y = 0; y < HEIGHT; y++)
for (x = 0; x < WIDTH; x++)
host_input[y+1][x+1] = host_output[y+1][x+1];
}
cpu_end_time = get_current_time();
errors = 0;
for (y = 0; y < HEIGHT; y++)
{
for (x = 0; x < WIDTH; x++)
{
if ( host_input[y+1][x+1] != gpu_output[y+1][x+1])
{
errors++;
printf("Error en %d,%d (CPU=%i, GPU=%i)\n", x, y, \
host_output[y+1][x+1], \
gpu_output[y+1][x+1]);
}
}
}
if (errors == 0)
std::cout << "\n\n ***TEST CORRECTO*** \n\n\n";
output_image = (unsigned char*)calloc(((WIDTH * HEIGHT) * 1), sizeof(unsigned char));
for (y = 0; y < HEIGHT; y++)
for (x = 0; x < WIDTH; x++)
output_image[y*WIDTH+x] = gpu_output[y+1][x+1];
cudaFree(d_input);
cudaFree(d_output);
printf("Tiempo ejecución GPU (Incluyendo transferencia de datos): %fs\n", \
end_time_inc_data - start_time_inc_data);
printf("Tiempo de ejecución en la CPU : %fs\n", \
cpu_end_time - cpu_start_time);
for( int i=0 ; i < Image.TellHeight() ; i++)
{
for( int j=0 ; j < Image.TellWidth() ; j++)
{
Image(i,j)->Red = output_image[i*WIDTH+j];
Image(i,j)->Green = output_image[i*WIDTH+j];
Image(i,j)->Blue = output_image[i*WIDTH+j];
}
}
// Guardamos el resultado de aplicar el filtro en un nuevo fichero
Image.WriteToFile("lena_1024_median.bmp");
std::cout << "Resultado escrito en lena_1024_median.bmp\n";
getchar();
return 0;
}
#if _WIN32
void getCurrentTimeStamp(timeStamp& _time)
{
QueryPerformanceCounter(&_time);
}
timeStamp getCurrentTimeStamp()
{
timeStamp tmp;
QueryPerformanceCounter(&tmp);
return tmp;
}
double getTimeMili()
{
timeStamp start;
timeStamp dwFreq;
QueryPerformanceFrequency(&dwFreq);
QueryPerformanceCounter(&start);
return double(start.QuadPart) / double(dwFreq.QuadPart);
}
#endif
double get_current_time()
{
#if _WIN32
return getTimeMili();
#else
static int start = 0, startu = 0;
struct timeval tval;
double result;
if (gettimeofday(&tval, NULL) == -1)
result = -1.0;
else if(!start) {
start = tval.tv_sec;
startu = tval.tv_usec;
result = 0.0;
}
else
result = (double) (tval.tv_sec - start) + 1.0e-6*(tval.tv_usec - startu);
return result;
#endif
}
必须修改或纠正图像的噪音
答案 0 :(得分:3)
据推测,你有这样的事情:
kernel1<<<...>>>(..., d_output, ...);
cudaMemcpy(gpu_output, d_output, kMemSize, cudaMemcpyDeviceToHost);
cudaMemcpy( d_input, gpu_output, kMemSize, cudaMemcpyHostToDevice);
kernel2<<<...>>>(d_input, ...);
在这种情况下,您可以通过以下方式避免这些复制操作:
kernel1<<<...>>>(..., d_output, ...);
kernel2<<<...>>>(d_output, ...);
这实际上只是在C中使用指针,它并不是真正特定于CUDA。使用普通的C函数和指针可以进行类似的操作。
编辑:既然您已经提供了完整的代码,它可能看起来像这样:
unsigned char *d_tmp = d_output;
d_output = d_input;
d_input = d_tmp;
for (i = 0; i < ITERATIONS; ++i)
{
// Ejecución kernel 1D por filas
dim3 blocksPerGrid(GRID_H, 1, 1);
dim3 threadsPerBlock(BLOCK_H, 1, 1);
d_tmp = d_output;
d_output = d_input;
d_input = d_tmp;
medianFilter1D_col<<<blocksPerGrid, threadsPerBlock>>>(d_output, d_input);
cudaDeviceSynchronize();
}