Question

我正在使用CUDA编写程序，尽管输入文件相同，但我得到了不同的输出。我使用的是Ubuntu和geforce gtx 960 M GPU。

    #include <stdio.h>
    #include <stdlib.h>
    #include <math.h>
    #include <time.h>
    #include "device_functions.h"

    //rows and columns
    const int rows_img = 320;
    const int columns_img = 640;
    const double max_rho_global = sqrt((rows_img*rows_img) + (columns_img*columns_img));
    const long size_img = rows_img * columns_img;


    const short resolution = 180;//arbitrary

    __global__ void sobel(int* input_img, int* d_sobel_img)
    {
        int x = threadIdx.x + blockIdx.x * blockDim.x;
        int y = threadIdx.y + blockIdx.y * blockDim.y;
        int offset = x + y * blockDim.x * gridDim.x;

        short gx[9] = {-1, 0, 1, -2, 0, 2, -1, 0, 1};
        short gy[9] = {1, 2, 1, 0, 0, 0, -1, -2, -1};

        short sobel_pixel;

        if(offset<2005 && offset > 2000)
        {
            printf("index: %d \n", offset);
        }

        //do not access values beyond image size and do not consider first and last row or first and last column
        if((offset < size_img) && (offset > columns_img) && (offset < size_img - columns_img) && (offset % columns_img != 0) && (offset % columns_img != 1))
        {

            double sobel_x = gx[0]*input_img[offset - columns_img - 1] + gx[1]*input_img[offset - columns_img] + gx[2]*input_img[offset - columns_img + 1] + gx[3]*input_img[offset - 1] +          gx[4]*input_img[offset] + gx[5]*input_img[offset + 1] + gx[6]*input_img[offset + columns_img - 1] + gx[7]*input_img[offset + columns_img] + gx[8]*input_img[offset + columns_img + 1];

            double sobel_y = gy[0]*input_img[offset - columns_img - 1] + gy[1]*input_img[offset - columns_img] + gy[2]*input_img[offset - columns_img + 1] + gy[3]*input_img[offset - 1] +          gy[4]*input_img[offset] + gy[5]*input_img[offset + 1] + gy[6]*input_img[offset + columns_img - 1] + gy[7]*input_img[offset + columns_img] + gy[8]*input_img[offset + columns_img + 1];

            sobel_pixel = round(sqrt(sobel_x*sobel_x + sobel_y*sobel_y));

            if(sobel_pixel > 255)
            {
                sobel_pixel = 255;
            }
            if(sobel_pixel < 0)
            {
                sobel_pixel = 0;
            }


        }
        else
        {
            sobel_pixel = 0;

        }

        d_sobel_img[offset] = sobel_pixel;
    }

    __global__ void ht_vote(int *input_img, int *hough_img, double *theta_vals, double *rho_vals, double *max_rho)
    {
        int x = threadIdx.x + blockIdx.x * blockDim.x;
        int y = threadIdx.y + blockIdx.y * blockDim.y;
        int offset = x + y * blockDim.x * gridDim.x;

        if(offset>10000 && offset <10005 )//&& offset > 10000)
        {
            printf("index from ht_vote: %d \n", offset);
        }



        if((offset < size_img) && (input_img[offset] > 5))//arbitrary threshold
        {

            short i;
            for (i = 0; i < resolution; i++)
            {
                double theta = (M_PI/resolution)*i;
                double rho = x * cos(theta) + y * sin(theta);
                short pos_x = (theta/M_PI)*resolution;
                short pos_y = (rho/x)*resolution;
                atomicAdd(&hough_img[pos_y * resolution + pos_x] , (int)1);

            }
        }
        atomicAdd(&hough_img[offset] , offset);
    }

    int main(int argc, char *argv[]) 
    {

        //input_file should be grayscale image
        FILE *input_file;
        input_file = fopen("pic3_gray_txt.txt", "r");

        int *input_img = (int *)malloc(sizeof(int) * ( size_img));
        int *sobel_img = (int *)malloc(sizeof(int) * ( size_img));

        int *d_input_img;

        int *output_img = (int *)malloc(sizeof(int) * ( size_img));

        long i;
        for (i = 0; i < size_img; i++)
        {
            fscanf(input_file, "%d", &input_img[i]);
        }

        double *d_theta, *d_rho, *d_max_rho;
        int *d_hough_trans, *d_sobel_img;

        //the following variables currently serve no purpose and will most likely not be used in the final program
        cudaMalloc(&d_theta, resolution*sizeof(double));
        cudaMalloc(&d_rho, resolution*sizeof(double));
        cudaMalloc(&d_max_rho, sizeof(double));

        cudaMalloc(&d_hough_trans, size_img*sizeof(int));//hough transformation
        cudaMalloc(&d_input_img, size_img*sizeof(int));//input of the program
        cudaMalloc(&d_sobel_img, size_img*sizeof(int));//output after sobel

        cudaMemcpy(d_input_img, input_img, (size_img)*sizeof(int), cudaMemcpyHostToDevice);

        dim3 blocks((rows_img/16), (columns_img/16));//arbitrary
        dim3 threads(16,16);//arbitrary

        //start sobel

        sobel<<<blocks, threads>>>(d_input_img, d_sobel_img);

        cudaFree(d_input_img);
        cudaMemcpy( sobel_img, d_sobel_img, size_img*sizeof(int),cudaMemcpyDeviceToHost);

        //end sobel

        //initialize all elements to 0
        cudaMemset(d_hough_trans, 0, resolution*resolution*sizeof(int));

        //d_max_rho and max_rho_global currently serve no purpose and will most likely be removed in the final implementation of the program
        cudaMemcpy(d_max_rho, &max_rho_global,sizeof(double), cudaMemcpyHostToDevice);

        dim3 blocks2((rows_img/16), (columns_img/16));//arbitrary
        dim3 threads2(16,16);//arbitrary

        ht_vote<<<blocks2,threads2>>>(d_sobel_img, d_hough_trans, d_theta, d_rho, d_max_rho);

        cudaFree(d_sobel_img);

        printf("ht_vote has finished\n");

        //TEST
        cudaMemcpy( output_img, d_hough_trans, size_img*sizeof(int),cudaMemcpyDeviceToHost);

        cudaFree(d_theta);
        cudaFree(d_rho);
        cudaFree(d_max_rho);
        cudaFree(d_hough_trans);
        cudaFree(d_input_img);

        //write output to file
        FILE *output = fopen("test2.txt", "wb");
        for(i = 0; i < resolution * resolution; i++)
            {
                fprintf(output,"%d ", output_img[i]);
            }

        fclose(output);

        //write output to file
        FILE *output2 = fopen("sobel_test.txt", "wb");
        for(i = 0; i < size_img; i++)
            {
                fprintf(output2,"%d ", sobel_img[i]);
            }

        fclose(output2);

        return 0;
    }

我正在使用CUDA编写程序，尽管输入文件相同，但我得到了不同的输出。我使用的是Ubuntu和geforce gtx 960 M GPU。

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include "device_functions.h"

//rows and columns
const int rows_img = 320;
const int columns_img = 640;
const double max_rho_global = sqrt((rows_img*rows_img) + (columns_img*columns_img));
const long size_img = rows_img * columns_img;


const short resolution = 180;//arbitrary

__global__ void sobel(int* input_img, int* d_sobel_img)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x;

    short gx[9] = {-1, 0, 1, -2, 0, 2, -1, 0, 1};
    short gy[9] = {1, 2, 1, 0, 0, 0, -1, -2, -1};

    short sobel_pixel;

    if(offset<2005 && offset > 2000)
    {
        printf("index: %d \n", offset);
    }

    //do not access values beyond image size and do not consider first and last row or first and last column
    if((offset < size_img) && (offset > columns_img) && (offset < size_img - columns_img) && (offset % columns_img != 0) && (offset % columns_img != 1))
    {

        double sobel_x = gx[0]*input_img[offset - columns_img - 1] + gx[1]*input_img[offset - columns_img] + gx[2]*input_img[offset - columns_img + 1] + gx[3]*input_img[offset - 1] +          gx[4]*input_img[offset] + gx[5]*input_img[offset + 1] + gx[6]*input_img[offset + columns_img - 1] + gx[7]*input_img[offset + columns_img] + gx[8]*input_img[offset + columns_img + 1];

        double sobel_y = gy[0]*input_img[offset - columns_img - 1] + gy[1]*input_img[offset - columns_img] + gy[2]*input_img[offset - columns_img + 1] + gy[3]*input_img[offset - 1] +          gy[4]*input_img[offset] + gy[5]*input_img[offset + 1] + gy[6]*input_img[offset + columns_img - 1] + gy[7]*input_img[offset + columns_img] + gy[8]*input_img[offset + columns_img + 1];

        sobel_pixel = round(sqrt(sobel_x*sobel_x + sobel_y*sobel_y));

        if(sobel_pixel > 255)
        {
            sobel_pixel = 255;
        }
        if(sobel_pixel < 0)
        {
            sobel_pixel = 0;
        }


    }
    else
    {
        sobel_pixel = 0;

    }

    d_sobel_img[offset] = sobel_pixel;
}

__global__ void ht_vote(int *input_img, int *hough_img, double *theta_vals, double *rho_vals, double *max_rho)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x;

    if(offset>10000 && offset <10005 )//&& offset > 10000)
    {
        printf("index from ht_vote: %d \n", offset);
    }



    if((offset < size_img) && (input_img[offset] > 5))//arbitrary threshold
    {

        short i;
        for (i = 0; i < resolution; i++)
        {
            double theta = (M_PI/resolution)*i;
            double rho = x * cos(theta) + y * sin(theta);
            short pos_x = (theta/M_PI)*resolution;
            short pos_y = (rho/x)*resolution;
            atomicAdd(&hough_img[pos_y * resolution + pos_x] , (int)1);

        }
    }
    atomicAdd(&hough_img[offset] , offset);
}

int main(int argc, char *argv[]) 
{

    //input_file should be grayscale image
    FILE *input_file;
    input_file = fopen("pic3_gray_txt.txt", "r");

    int *input_img = (int *)malloc(sizeof(int) * ( size_img));
    int *sobel_img = (int *)malloc(sizeof(int) * ( size_img));

    int *d_input_img;

    int *output_img = (int *)malloc(sizeof(int) * ( size_img));

    long i;
    for (i = 0; i < size_img; i++)
    {
        fscanf(input_file, "%d", &input_img[i]);
    }

    double *d_theta, *d_rho, *d_max_rho;
    int *d_hough_trans, *d_sobel_img;

    //the following variables currently serve no purpose and will most likely not be used in the final program
    cudaMalloc(&d_theta, resolution*sizeof(double));
    cudaMalloc(&d_rho, resolution*sizeof(double));
    cudaMalloc(&d_max_rho, sizeof(double));

    cudaMalloc(&d_hough_trans, size_img*sizeof(int));//hough transformation
    cudaMalloc(&d_input_img, size_img*sizeof(int));//input of the program
    cudaMalloc(&d_sobel_img, size_img*sizeof(int));//output after sobel

    cudaMemcpy(d_input_img, input_img, (size_img)*sizeof(int), cudaMemcpyHostToDevice);

    dim3 blocks((rows_img/16), (columns_img/16));//arbitrary
    dim3 threads(16,16);//arbitrary

    //start sobel

    sobel<<<blocks, threads>>>(d_input_img, d_sobel_img);

    cudaFree(d_input_img);
    cudaMemcpy( sobel_img, d_sobel_img, size_img*sizeof(int),cudaMemcpyDeviceToHost);

    //end sobel

    //initialize all elements to 0
    cudaMemset(d_hough_trans, 0, resolution*resolution*sizeof(int));

    //d_max_rho and max_rho_global currently serve no purpose and will most likely be removed in the final implementation of the program
    cudaMemcpy(d_max_rho, &max_rho_global,sizeof(double), cudaMemcpyHostToDevice);

    dim3 blocks2((rows_img/16), (columns_img/16));//arbitrary
    dim3 threads2(16,16);//arbitrary

    ht_vote<<<blocks2,threads2>>>(d_sobel_img, d_hough_trans, d_theta, d_rho, d_max_rho);

    cudaFree(d_sobel_img);

    printf("ht_vote has finished\n");

    //TEST
    cudaMemcpy( output_img, d_hough_trans, size_img*sizeof(int),cudaMemcpyDeviceToHost);

    cudaFree(d_theta);
    cudaFree(d_rho);
    cudaFree(d_max_rho);
    cudaFree(d_hough_trans);
    cudaFree(d_input_img);

    //write output to file
    FILE *output = fopen("test2.txt", "wb");
    for(i = 0; i < resolution * resolution; i++)
        {
            fprintf(output,"%d ", output_img[i]);
        }

    fclose(output);

    //write output to file
    FILE *output2 = fopen("sobel_test.txt", "wb");
    for(i = 0; i < size_img; i++)
        {
            fprintf(output2,"%d ", sobel_img[i]);
        }

    fclose(output2);

    return 0;
}

当我执行代码时，我得到以下之一：

//output scenario 1:

index: 2001 
index: 2002 
index: 2003 
index: 2004 
ht_vote has finished

//output scenario 2:

index: 2001 
index: 2002 
index: 2003 
index: 2004 
index from ht_vote: 10001 
index from ht_vote: 10002 
index from ht_vote: 10003 
index from ht_vote: 10004 
ht_vote has finished

我得到的另一个违反直觉的例子如下：

    #include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include "device_functions.h"

//rows and columns
const int rows_img = 320;
const int columns_img = 640;
const double max_rho_global = sqrt((rows_img*rows_img) + (columns_img*columns_img));
const long size_img = rows_img * columns_img;

int threads = 512;//threads per block
int blocks = size_img/threads + 1;

const short resolution = 180;//arbitrary

__global__ void ht_vote(int *input_img, int *hough_img, double *theta_vals, double *rho_vals, double *max_rho)
{
    double x = *max_rho;
    int index = threadIdx.x + blockDim.x * blockIdx.x;

    if((input_img[index] > 0) && (index < size_img))//arbitrary threshold
    {

        if(index == 51)
            {
                printf("the index is 51 for this thread/block \n");
            }

        short i;
        for (i = 0; i < resolution; i++)
        {
            theta_vals[i] = (M_PI/resolution)*i;
            short x = index%columns_img;
            short y = resolution - index/resolution;
            rho_vals[i] = x * cos(theta_vals[i]) ;

            if (index == 51 && i < 5)
            {       
                    printf("====================================\n");
                    printf("i: %d theta_vals[i]: %f \n", i, theta_vals[i]);
                    printf("index modulo columns_img: %d \n", index%columns_img);
                    printf("resolution - index/resolution: %d \n", resolution - index/resolution);

                    printf("x: %d \n", x);
                    printf("y: %d \n", y);
                    printf("x * cos(theta): %f \n", x * cos(theta_vals[i]));
                    printf("y * sin(theta): %f \n", y * sin(theta_vals[i]));
                    printf("i: %d rho_vals[i]: %f \n", i, rho_vals[i]);
                    printf("====================================\n");
            }
        }

        short pos_x, pos_y;
        for (i = 0; i < resolution; i++)
        {
            pos_x = (theta_vals[i]/M_PI)*resolution;
            pos_y = (rho_vals[i]/x)*resolution;
            atomicAdd(&hough_img[pos_y * resolution + pos_x] , (int)1);
        }
    }
}

int main(int argc, char *argv[]) 
{
    //input_file should be grayscale image
    FILE *input_file;
    input_file = fopen("pic3_gray_txt.txt", "r");

    int *input_img = (int *)malloc(sizeof(int) * ( size_img));

    int *d_input_img;

    int *output_img = (int *)malloc(sizeof(int) * ( size_img));

    long i;
    for (i = 0; i < size_img; i++)
    {
        fscanf(input_file, "%d", &input_img[i]);
    }


    double *d_theta, *d_rho, *d_max_rho;
    int *d_hough_trans;
    cudaMalloc(&d_theta, resolution*sizeof(double));
    cudaMalloc(&d_rho, resolution*sizeof(double));
    cudaMalloc(&d_max_rho, sizeof(double));
    cudaMalloc(&d_hough_trans, resolution*resolution*sizeof(int));
    cudaMalloc(&d_input_img, size_img*sizeof(int));

    //initialize all elements to 0
    cudaMemset(d_hough_trans, 0, resolution*resolution*sizeof(int));

    cudaMemcpy(d_max_rho, &max_rho_global,sizeof(double), cudaMemcpyHostToDevice);

    cudaMemcpy(d_input_img, input_img, (size_img)*sizeof(int), cudaMemcpyHostToDevice);

    ht_vote<<<blocks,threads>>>(d_input_img, d_hough_trans, d_theta, d_rho, d_max_rho);

    printf("ht_vote has finished\n");

    //TEST
    cudaMemcpy( output_img, d_hough_trans, size_img*sizeof(int),cudaMemcpyDeviceToHost);

    //cudaMemcpy( output_img, d_hough_trans, resolution*resolution*sizeof(int),cudaMemcpyDeviceToHost);

    cudaFree(d_theta);
    cudaFree(d_rho);
    cudaFree(d_max_rho);
    cudaFree(d_hough_trans);
    cudaFree(d_input_img);

    //clock_t start = clock();//time

    //write output to file
    FILE *output = fopen("test2.txt", "wb");
    for(i = 0; i < resolution * resolution; i++)
        {
            fprintf(output,"%d ", output_img[i]);
        }

    printf("resolution: %d \n", resolution);
    fclose(output);

    return 0;
}

上面的代码运行时会得到以下输出：

    > ht_vote has finished the index is 51 for this thread/block 
> ==================================== i: 0 theta_vals[i]: 0.000000  index modulo columns_img: 51  resolution - index/resolution: 180  x:
> 51  y: 180  x * cos(theta): 51.000000  y * sin(theta): 0.000000  i: 0
> rho_vals[i]: 63.000000 
> ====================================
> ==================================== i: 1 theta_vals[i]: 0.017453  index modulo columns_img: 51  resolution - index/resolution: 180  x:
> 51  y: 180  x * cos(theta): 50.992232  y * sin(theta): 3.141433  i: 1
> rho_vals[i]: 574.912425 
> ====================================
> ==================================== i: 2 theta_vals[i]: 0.034907  index modulo columns_img: 51  resolution - index/resolution: 180  x:
> 51  y: 180  x * cos(theta): 50.968932  y * sin(theta): 6.281909  i: 2
> rho_vals[i]: 574.649726 
> ====================================
> ==================================== i: 3 theta_vals[i]: 0.052360  index modulo columns_img: 51  resolution - index/resolution: 180  x:
> 51  y: 180  x * cos(theta): 50.930106  y * sin(theta): 9.420472  i: 3
> rho_vals[i]: 542.255837 
> ====================================
> ==================================== i: 4 theta_vals[i]: 0.069813  index modulo columns_img: 51  resolution - index/resolution: 180  x:
> 51  y: 180  x * cos(theta): 50.875767  y * sin(theta): 12.556165  i: 4
> rho_vals[i]: 445.911130 
> ==================================== resolution: 180

我认为rho_vals和x * cos（theta）对于所有人来说应该是相同的，但是他们不是。

顺便说一下，这可能是Nvidia驱动程序的问题吗？我不得不对nvidia驱动程序进行一些更改，以防止ubuntu出现故障，但它在某种程度上仍然存在问题......

Cuda - 为相同的输入获得不同的输出。数组也显示错误的值

0 个答案: