在相同的逻辑下,CUDA __device__函数的工作方式与__global__函数不同,__global__函数给出了错误的答案

时间:2019-06-23 03:51:23

标签: c++ cuda

Cuda的全局功能无法正确交换全局内存数据

我想沿上下方向镜像翻转矩阵,但是 global 函数会将数据重置为零,我不知道为什么?

我编写了一个名为“交换”的设备函数,但是它确实有效吗?

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cufft.h"

#include <iostream>
using namespace std;

// avoid memory leaking
struct cudaMemGuard {
    cudaMemGuard(void** memPtr) :m_ptr(memPtr) {    }
    ~cudaMemGuard() {
        cudaFree(*m_ptr);
    }
private:
    void **m_ptr = nullptr;
};

#define checkCudaErrors(a) do{if((a)!=cudaSuccess){cudaError_t errCode=cudaGetLastError(); \
    fprintf(stderr,"Cuda errors: %s, File: %s, Line: %d", \
    cudaGetErrorString(errCode), __FILE__, __LINE__);cudaDeviceReset();return errCode;}}while(0);

__device__ void Swap(double& x1, double &x2) {
    double temp = x1;
    x1 = x2;
    x2 = temp;
}

__global__ void fftShiftKernel(double *data,  int rows, int cols) {
    unsigned idx = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned idy = blockIdx.y*blockDim.y + threadIdx.y;
    unsigned index = idy * blockDim.x*gridDim.x + idx;
    if ((rows & 1) == 0) {
        unsigned gapX = cols / 2, gapY = rows / 2;
        if (idy < gapX) {
            unsigned  i = index, j = index + gapY * cols;
            //Swap(data[i], data[j]); // It does work!
            double temp = data[j];
            data[j] = data[i];
            data[i] = temp; // when this line executed, data[j] would be reset to 0 concurrently, why? But if I use the Swap function, it does work, I'm confused!
        }
    }
}

cudaError_t fftShift(double *data, double *result, int rows, int cols) {
    double *devDataPtr;
    cudaMemGuard devDataPtrG((void**)&devDataPtr);
    cudaSetDevice(0);
    unsigned byteNum = sizeof(double)*rows*cols;
    checkCudaErrors(cudaMalloc((void**)&devDataPtr, byteNum));
    checkCudaErrors(cudaMemcpy(devDataPtr, data, byteNum, cudaMemcpyHostToDevice));
    dim3 threadsPerBlock(16, 16, 1);
    dim3 blocksPerGrid((rows + 15) / 16, (cols + 15) / 16);
    fftShiftKernel << <blocksPerGrid, threadsPerBlock >> > (devDataPtr, rows, cols);
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaMemcpy(result, devDataPtr, byteNum, cudaMemcpyDeviceToHost));
    cudaDeviceSynchronize();
    cudaDeviceReset();
    return cudaSuccess;
}

int main()
{
    int rows = 6, cols = 5;
    double *data = new double[rows*cols];
    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
            data[i*cols + j] = (double)i*cols + j; //{ (double)i*cols + j + 1,(double)i*cols + j + 1 };
            cout << data[i*cols + j] << " ";
        }
        cout << endl;
    }
    cout << endl;

    double *result = new double[rows*cols];
    fftShift(data, result, rows, cols);

    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
            cout << result[i*cols + j] << " ";
        }
        cout << endl;
    }

    return 0;
}

我想在上下方向镜像翻转矩阵,但是在下半部分输出为零。我发现当执行“ double temp = data [j]; data [j] = data [i]; data [i] = temp;”时, global 函数将在执行最后一行代码后将数据重置为零,我不知道为什么?

0 个答案:

没有答案