我正在使用CUDA编写程序,尽管输入文件相同,但我得到了不同的输出。我使用的是Ubuntu和geforce gtx 960 M GPU。
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include "device_functions.h"
//rows and columns
const int rows_img = 320;
const int columns_img = 640;
const double max_rho_global = sqrt((rows_img*rows_img) + (columns_img*columns_img));
const long size_img = rows_img * columns_img;
const short resolution = 180;//arbitrary
__global__ void sobel(int* input_img, int* d_sobel_img)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
short gx[9] = {-1, 0, 1, -2, 0, 2, -1, 0, 1};
short gy[9] = {1, 2, 1, 0, 0, 0, -1, -2, -1};
short sobel_pixel;
if(offset<2005 && offset > 2000)
{
printf("index: %d \n", offset);
}
//do not access values beyond image size and do not consider first and last row or first and last column
if((offset < size_img) && (offset > columns_img) && (offset < size_img - columns_img) && (offset % columns_img != 0) && (offset % columns_img != 1))
{
double sobel_x = gx[0]*input_img[offset - columns_img - 1] + gx[1]*input_img[offset - columns_img] + gx[2]*input_img[offset - columns_img + 1] + gx[3]*input_img[offset - 1] + gx[4]*input_img[offset] + gx[5]*input_img[offset + 1] + gx[6]*input_img[offset + columns_img - 1] + gx[7]*input_img[offset + columns_img] + gx[8]*input_img[offset + columns_img + 1];
double sobel_y = gy[0]*input_img[offset - columns_img - 1] + gy[1]*input_img[offset - columns_img] + gy[2]*input_img[offset - columns_img + 1] + gy[3]*input_img[offset - 1] + gy[4]*input_img[offset] + gy[5]*input_img[offset + 1] + gy[6]*input_img[offset + columns_img - 1] + gy[7]*input_img[offset + columns_img] + gy[8]*input_img[offset + columns_img + 1];
sobel_pixel = round(sqrt(sobel_x*sobel_x + sobel_y*sobel_y));
if(sobel_pixel > 255)
{
sobel_pixel = 255;
}
if(sobel_pixel < 0)
{
sobel_pixel = 0;
}
}
else
{
sobel_pixel = 0;
}
d_sobel_img[offset] = sobel_pixel;
}
__global__ void ht_vote(int *input_img, int *hough_img, double *theta_vals, double *rho_vals, double *max_rho)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
if(offset>10000 && offset <10005 )//&& offset > 10000)
{
printf("index from ht_vote: %d \n", offset);
}
if((offset < size_img) && (input_img[offset] > 5))//arbitrary threshold
{
short i;
for (i = 0; i < resolution; i++)
{
double theta = (M_PI/resolution)*i;
double rho = x * cos(theta) + y * sin(theta);
short pos_x = (theta/M_PI)*resolution;
short pos_y = (rho/x)*resolution;
atomicAdd(&hough_img[pos_y * resolution + pos_x] , (int)1);
}
}
atomicAdd(&hough_img[offset] , offset);
}
int main(int argc, char *argv[])
{
//input_file should be grayscale image
FILE *input_file;
input_file = fopen("pic3_gray_txt.txt", "r");
int *input_img = (int *)malloc(sizeof(int) * ( size_img));
int *sobel_img = (int *)malloc(sizeof(int) * ( size_img));
int *d_input_img;
int *output_img = (int *)malloc(sizeof(int) * ( size_img));
long i;
for (i = 0; i < size_img; i++)
{
fscanf(input_file, "%d", &input_img[i]);
}
double *d_theta, *d_rho, *d_max_rho;
int *d_hough_trans, *d_sobel_img;
//the following variables currently serve no purpose and will most likely not be used in the final program
cudaMalloc(&d_theta, resolution*sizeof(double));
cudaMalloc(&d_rho, resolution*sizeof(double));
cudaMalloc(&d_max_rho, sizeof(double));
cudaMalloc(&d_hough_trans, size_img*sizeof(int));//hough transformation
cudaMalloc(&d_input_img, size_img*sizeof(int));//input of the program
cudaMalloc(&d_sobel_img, size_img*sizeof(int));//output after sobel
cudaMemcpy(d_input_img, input_img, (size_img)*sizeof(int), cudaMemcpyHostToDevice);
dim3 blocks((rows_img/16), (columns_img/16));//arbitrary
dim3 threads(16,16);//arbitrary
//start sobel
sobel<<<blocks, threads>>>(d_input_img, d_sobel_img);
cudaFree(d_input_img);
cudaMemcpy( sobel_img, d_sobel_img, size_img*sizeof(int),cudaMemcpyDeviceToHost);
//end sobel
//initialize all elements to 0
cudaMemset(d_hough_trans, 0, resolution*resolution*sizeof(int));
//d_max_rho and max_rho_global currently serve no purpose and will most likely be removed in the final implementation of the program
cudaMemcpy(d_max_rho, &max_rho_global,sizeof(double), cudaMemcpyHostToDevice);
dim3 blocks2((rows_img/16), (columns_img/16));//arbitrary
dim3 threads2(16,16);//arbitrary
ht_vote<<<blocks2,threads2>>>(d_sobel_img, d_hough_trans, d_theta, d_rho, d_max_rho);
cudaFree(d_sobel_img);
printf("ht_vote has finished\n");
//TEST
cudaMemcpy( output_img, d_hough_trans, size_img*sizeof(int),cudaMemcpyDeviceToHost);
cudaFree(d_theta);
cudaFree(d_rho);
cudaFree(d_max_rho);
cudaFree(d_hough_trans);
cudaFree(d_input_img);
//write output to file
FILE *output = fopen("test2.txt", "wb");
for(i = 0; i < resolution * resolution; i++)
{
fprintf(output,"%d ", output_img[i]);
}
fclose(output);
//write output to file
FILE *output2 = fopen("sobel_test.txt", "wb");
for(i = 0; i < size_img; i++)
{
fprintf(output2,"%d ", sobel_img[i]);
}
fclose(output2);
return 0;
}
我正在使用CUDA编写程序,尽管输入文件相同,但我得到了不同的输出。我使用的是Ubuntu和geforce gtx 960 M GPU。
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include "device_functions.h"
//rows and columns
const int rows_img = 320;
const int columns_img = 640;
const double max_rho_global = sqrt((rows_img*rows_img) + (columns_img*columns_img));
const long size_img = rows_img * columns_img;
const short resolution = 180;//arbitrary
__global__ void sobel(int* input_img, int* d_sobel_img)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
short gx[9] = {-1, 0, 1, -2, 0, 2, -1, 0, 1};
short gy[9] = {1, 2, 1, 0, 0, 0, -1, -2, -1};
short sobel_pixel;
if(offset<2005 && offset > 2000)
{
printf("index: %d \n", offset);
}
//do not access values beyond image size and do not consider first and last row or first and last column
if((offset < size_img) && (offset > columns_img) && (offset < size_img - columns_img) && (offset % columns_img != 0) && (offset % columns_img != 1))
{
double sobel_x = gx[0]*input_img[offset - columns_img - 1] + gx[1]*input_img[offset - columns_img] + gx[2]*input_img[offset - columns_img + 1] + gx[3]*input_img[offset - 1] + gx[4]*input_img[offset] + gx[5]*input_img[offset + 1] + gx[6]*input_img[offset + columns_img - 1] + gx[7]*input_img[offset + columns_img] + gx[8]*input_img[offset + columns_img + 1];
double sobel_y = gy[0]*input_img[offset - columns_img - 1] + gy[1]*input_img[offset - columns_img] + gy[2]*input_img[offset - columns_img + 1] + gy[3]*input_img[offset - 1] + gy[4]*input_img[offset] + gy[5]*input_img[offset + 1] + gy[6]*input_img[offset + columns_img - 1] + gy[7]*input_img[offset + columns_img] + gy[8]*input_img[offset + columns_img + 1];
sobel_pixel = round(sqrt(sobel_x*sobel_x + sobel_y*sobel_y));
if(sobel_pixel > 255)
{
sobel_pixel = 255;
}
if(sobel_pixel < 0)
{
sobel_pixel = 0;
}
}
else
{
sobel_pixel = 0;
}
d_sobel_img[offset] = sobel_pixel;
}
__global__ void ht_vote(int *input_img, int *hough_img, double *theta_vals, double *rho_vals, double *max_rho)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
if(offset>10000 && offset <10005 )//&& offset > 10000)
{
printf("index from ht_vote: %d \n", offset);
}
if((offset < size_img) && (input_img[offset] > 5))//arbitrary threshold
{
short i;
for (i = 0; i < resolution; i++)
{
double theta = (M_PI/resolution)*i;
double rho = x * cos(theta) + y * sin(theta);
short pos_x = (theta/M_PI)*resolution;
short pos_y = (rho/x)*resolution;
atomicAdd(&hough_img[pos_y * resolution + pos_x] , (int)1);
}
}
atomicAdd(&hough_img[offset] , offset);
}
int main(int argc, char *argv[])
{
//input_file should be grayscale image
FILE *input_file;
input_file = fopen("pic3_gray_txt.txt", "r");
int *input_img = (int *)malloc(sizeof(int) * ( size_img));
int *sobel_img = (int *)malloc(sizeof(int) * ( size_img));
int *d_input_img;
int *output_img = (int *)malloc(sizeof(int) * ( size_img));
long i;
for (i = 0; i < size_img; i++)
{
fscanf(input_file, "%d", &input_img[i]);
}
double *d_theta, *d_rho, *d_max_rho;
int *d_hough_trans, *d_sobel_img;
//the following variables currently serve no purpose and will most likely not be used in the final program
cudaMalloc(&d_theta, resolution*sizeof(double));
cudaMalloc(&d_rho, resolution*sizeof(double));
cudaMalloc(&d_max_rho, sizeof(double));
cudaMalloc(&d_hough_trans, size_img*sizeof(int));//hough transformation
cudaMalloc(&d_input_img, size_img*sizeof(int));//input of the program
cudaMalloc(&d_sobel_img, size_img*sizeof(int));//output after sobel
cudaMemcpy(d_input_img, input_img, (size_img)*sizeof(int), cudaMemcpyHostToDevice);
dim3 blocks((rows_img/16), (columns_img/16));//arbitrary
dim3 threads(16,16);//arbitrary
//start sobel
sobel<<<blocks, threads>>>(d_input_img, d_sobel_img);
cudaFree(d_input_img);
cudaMemcpy( sobel_img, d_sobel_img, size_img*sizeof(int),cudaMemcpyDeviceToHost);
//end sobel
//initialize all elements to 0
cudaMemset(d_hough_trans, 0, resolution*resolution*sizeof(int));
//d_max_rho and max_rho_global currently serve no purpose and will most likely be removed in the final implementation of the program
cudaMemcpy(d_max_rho, &max_rho_global,sizeof(double), cudaMemcpyHostToDevice);
dim3 blocks2((rows_img/16), (columns_img/16));//arbitrary
dim3 threads2(16,16);//arbitrary
ht_vote<<<blocks2,threads2>>>(d_sobel_img, d_hough_trans, d_theta, d_rho, d_max_rho);
cudaFree(d_sobel_img);
printf("ht_vote has finished\n");
//TEST
cudaMemcpy( output_img, d_hough_trans, size_img*sizeof(int),cudaMemcpyDeviceToHost);
cudaFree(d_theta);
cudaFree(d_rho);
cudaFree(d_max_rho);
cudaFree(d_hough_trans);
cudaFree(d_input_img);
//write output to file
FILE *output = fopen("test2.txt", "wb");
for(i = 0; i < resolution * resolution; i++)
{
fprintf(output,"%d ", output_img[i]);
}
fclose(output);
//write output to file
FILE *output2 = fopen("sobel_test.txt", "wb");
for(i = 0; i < size_img; i++)
{
fprintf(output2,"%d ", sobel_img[i]);
}
fclose(output2);
return 0;
}
当我执行代码时,我得到以下之一:
//output scenario 1:
index: 2001
index: 2002
index: 2003
index: 2004
ht_vote has finished
//output scenario 2:
index: 2001
index: 2002
index: 2003
index: 2004
index from ht_vote: 10001
index from ht_vote: 10002
index from ht_vote: 10003
index from ht_vote: 10004
ht_vote has finished
我得到的另一个违反直觉的例子如下:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include "device_functions.h"
//rows and columns
const int rows_img = 320;
const int columns_img = 640;
const double max_rho_global = sqrt((rows_img*rows_img) + (columns_img*columns_img));
const long size_img = rows_img * columns_img;
int threads = 512;//threads per block
int blocks = size_img/threads + 1;
const short resolution = 180;//arbitrary
__global__ void ht_vote(int *input_img, int *hough_img, double *theta_vals, double *rho_vals, double *max_rho)
{
double x = *max_rho;
int index = threadIdx.x + blockDim.x * blockIdx.x;
if((input_img[index] > 0) && (index < size_img))//arbitrary threshold
{
if(index == 51)
{
printf("the index is 51 for this thread/block \n");
}
short i;
for (i = 0; i < resolution; i++)
{
theta_vals[i] = (M_PI/resolution)*i;
short x = index%columns_img;
short y = resolution - index/resolution;
rho_vals[i] = x * cos(theta_vals[i]) ;
if (index == 51 && i < 5)
{
printf("====================================\n");
printf("i: %d theta_vals[i]: %f \n", i, theta_vals[i]);
printf("index modulo columns_img: %d \n", index%columns_img);
printf("resolution - index/resolution: %d \n", resolution - index/resolution);
printf("x: %d \n", x);
printf("y: %d \n", y);
printf("x * cos(theta): %f \n", x * cos(theta_vals[i]));
printf("y * sin(theta): %f \n", y * sin(theta_vals[i]));
printf("i: %d rho_vals[i]: %f \n", i, rho_vals[i]);
printf("====================================\n");
}
}
short pos_x, pos_y;
for (i = 0; i < resolution; i++)
{
pos_x = (theta_vals[i]/M_PI)*resolution;
pos_y = (rho_vals[i]/x)*resolution;
atomicAdd(&hough_img[pos_y * resolution + pos_x] , (int)1);
}
}
}
int main(int argc, char *argv[])
{
//input_file should be grayscale image
FILE *input_file;
input_file = fopen("pic3_gray_txt.txt", "r");
int *input_img = (int *)malloc(sizeof(int) * ( size_img));
int *d_input_img;
int *output_img = (int *)malloc(sizeof(int) * ( size_img));
long i;
for (i = 0; i < size_img; i++)
{
fscanf(input_file, "%d", &input_img[i]);
}
double *d_theta, *d_rho, *d_max_rho;
int *d_hough_trans;
cudaMalloc(&d_theta, resolution*sizeof(double));
cudaMalloc(&d_rho, resolution*sizeof(double));
cudaMalloc(&d_max_rho, sizeof(double));
cudaMalloc(&d_hough_trans, resolution*resolution*sizeof(int));
cudaMalloc(&d_input_img, size_img*sizeof(int));
//initialize all elements to 0
cudaMemset(d_hough_trans, 0, resolution*resolution*sizeof(int));
cudaMemcpy(d_max_rho, &max_rho_global,sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_input_img, input_img, (size_img)*sizeof(int), cudaMemcpyHostToDevice);
ht_vote<<<blocks,threads>>>(d_input_img, d_hough_trans, d_theta, d_rho, d_max_rho);
printf("ht_vote has finished\n");
//TEST
cudaMemcpy( output_img, d_hough_trans, size_img*sizeof(int),cudaMemcpyDeviceToHost);
//cudaMemcpy( output_img, d_hough_trans, resolution*resolution*sizeof(int),cudaMemcpyDeviceToHost);
cudaFree(d_theta);
cudaFree(d_rho);
cudaFree(d_max_rho);
cudaFree(d_hough_trans);
cudaFree(d_input_img);
//clock_t start = clock();//time
//write output to file
FILE *output = fopen("test2.txt", "wb");
for(i = 0; i < resolution * resolution; i++)
{
fprintf(output,"%d ", output_img[i]);
}
printf("resolution: %d \n", resolution);
fclose(output);
return 0;
}
上面的代码运行时会得到以下输出:
> ht_vote has finished the index is 51 for this thread/block
> ==================================== i: 0 theta_vals[i]: 0.000000 index modulo columns_img: 51 resolution - index/resolution: 180 x:
> 51 y: 180 x * cos(theta): 51.000000 y * sin(theta): 0.000000 i: 0
> rho_vals[i]: 63.000000
> ====================================
> ==================================== i: 1 theta_vals[i]: 0.017453 index modulo columns_img: 51 resolution - index/resolution: 180 x:
> 51 y: 180 x * cos(theta): 50.992232 y * sin(theta): 3.141433 i: 1
> rho_vals[i]: 574.912425
> ====================================
> ==================================== i: 2 theta_vals[i]: 0.034907 index modulo columns_img: 51 resolution - index/resolution: 180 x:
> 51 y: 180 x * cos(theta): 50.968932 y * sin(theta): 6.281909 i: 2
> rho_vals[i]: 574.649726
> ====================================
> ==================================== i: 3 theta_vals[i]: 0.052360 index modulo columns_img: 51 resolution - index/resolution: 180 x:
> 51 y: 180 x * cos(theta): 50.930106 y * sin(theta): 9.420472 i: 3
> rho_vals[i]: 542.255837
> ====================================
> ==================================== i: 4 theta_vals[i]: 0.069813 index modulo columns_img: 51 resolution - index/resolution: 180 x:
> 51 y: 180 x * cos(theta): 50.875767 y * sin(theta): 12.556165 i: 4
> rho_vals[i]: 445.911130
> ==================================== resolution: 180
我认为rho_vals和x * cos(theta)对于所有人来说应该是相同的,但是他们不是。
顺便说一下,这可能是Nvidia驱动程序的问题吗?我不得不对nvidia驱动程序进行一些更改,以防止ubuntu出现故障,但它在某种程度上仍然存在问题......