即使在初始化结果参数之后,CUDA atomicAdd也会产生错误的结果

时间:2017-12-20 22:14:17

标签: c++ cuda

atomicAdd会产生错误的结果。我甚至在设备中初始化结果但仍无法正常工作。那是为什么?

这是我调用的函数:

// Kernel: compares str1 and str2 position by position and adds 1 to *result
// for every position at which the two characters differ.
//
// str1, str2 : character buffers indexed by the global thread index.
// result     : counter that is incremented with atomicAdd; the kernel only
//              adds to it, so the caller must set its starting value.
//
// Only the first 3 positions are examined (the bound is hard-coded below).
__global__
void getHammingDistance(char *str1, char *str2, int *result)
{
    // Flat global thread index built from the block and thread coordinates.
    int idx = blockIdx.x*blockDim.x+threadIdx.x;

    // Bounds guard: threads with idx >= 3 do nothing.
    if (idx < 3)
    {
        if (str1[idx] != str2[idx])
        {
            // Several threads may hit the same counter, hence the atomic add.
            atomicAdd(result, 1);

        }
    }
}

但atomicAdd会给出错误的结果。

// Host driver: allocates device buffers, copies two strings and a zeroed
// counter to the device, launches getHammingDistance, copies the counter
// back, prints it and returns it.
int getDist()
{

    // k and min are declared but not used anywhere in this function.
    int k = 9;
    int min = INT_MAX;
    char *dev_str1, *dev_str2;
    int *dev_result;
    int blockSize = 1024;
    // ceil(4 / 1024.0f) evaluates to 1, so a single block is launched.
    int gridSize = (int) ceil((float) 4 / blockSize);
    int result = 0;


    // Both literals are 3 characters plus the terminating '\0'.
    char *str1 = "AAA";
    char *str2 = "ATG";


    // Device buffers: 20 bytes per string and room for 10 ints for the result.
    cudaMalloc((void**) &dev_str1, sizeof(char)  *20);
    cudaMalloc((void**) &dev_str2, sizeof(char)  * 20);
    cudaMalloc((void**) &dev_result, sizeof(int) * 10);

    // NOTE(review): &str1 / &str2 are the addresses of the host pointer
    // variables themselves (char**), not of the characters they point to,
    // and 20 bytes are read starting at those addresses.
    cudaMemcpy(dev_str1, &str1, 20 * sizeof(char), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_str2, &str2, 20 * sizeof(char), cudaMemcpyHostToDevice);
    // NOTE(review): result is a single int, but 10 * sizeof(int) bytes are
    // read starting at &result.
    cudaMemcpy(dev_result, &result,  10 * sizeof(int), cudaMemcpyHostToDevice);

    getHammingDistance<<<gridSize, blockSize>>>(dev_str1, dev_str2, dev_result);
    // NOTE(review): 10 * sizeof(int) bytes are written to &result, which is
    // a single int. None of the CUDA return codes in this function are checked.
    cudaMemcpy(&result, dev_result, 10 * sizeof(int), cudaMemcpyDeviceToHost);

    cout << result;
    cudaFree(dev_str1);
    cudaFree(dev_str2);
    cudaFree(dev_result);


    return result;
} 

为什么?它应该有用。

1 个答案:

答案 0 :(得分:1)

原始代码中的错误在这里

cudaMemcpy(dev_str1, &str1, 20 * sizeof(char), cudaMemcpyHostToDevice);
cudaMemcpy(dev_str2, &str2, 20 * sizeof(char), cudaMemcpyHostToDevice);

这与函数签名不符。正确的签名是:

cudaMemcpy(void * dst, const void * src, size_t count, enum cudaMemcpyKind kind)

您在调用时传入的是指针变量的地址(相当于const void ** src)。即使纠正了这一点,仍会复制大量垃圾数据,因为您的字符串只有三个字符。下面是基本正确的代码:

#include <cstdlib>
#include <cstring>
#include <iostream>

// Kernel: counts the positions at which str1 and str2 differ and accumulates
// the count into *result.
//
// str1, str2 : device buffers holding at least `length` characters each;
//              they are only read.
// result     : device counter; the kernel only adds to it, so the caller
//              must zero it (or set the desired starting value) beforehand.
// length     : number of leading characters to compare. Defaults to 3 so
//              that existing three-argument launches behave as before.
//
// Launch layout: 1D grid of 1D blocks. The grid-stride loop makes the result
// independent of how many threads are launched (any non-empty configuration).
__global__
void getHammingDistance(const char *__restrict__ str1,
                        const char *__restrict__ str2,
                        int *result,
                        int length = 3)
{
    // Total number of threads in the grid; each thread visits every
    // stride-th position starting from its own global index.
    const int stride = gridDim.x * blockDim.x;

    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < length; idx += stride)
    {
        if (str1[idx] != str2[idx])
        {
            // Many threads may update the same counter, hence the atomic add.
            atomicAdd(result, 1);
        }
    }
}

// Terminates the program with a diagnostic when a CUDA runtime call failed.
// `what` names the call for the error message.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        std::cerr << "CUDA error in " << what << ": "
                  << cudaGetErrorString(status) << "\n";
        std::exit(EXIT_FAILURE);
    }
}

// Computes the Hamming distance between the hard-coded strings "AAA" and
// "ATG" on the GPU, prints it followed by a newline and returns it.
//
// Every CUDA runtime call is checked; on failure a message is written to
// std::cerr and the process exits with EXIT_FAILURE.
int getDist()
{
    char *dev_str1 = nullptr, *dev_str2 = nullptr;
    int *dev_result = nullptr;
    int result = 0;

    char str1[] = "AAA";
    char str2[] = "ATG";

    // Number of characters to transfer, excluding the terminating '\0'.
    // Both strings have the same length; the kernel is launched with three
    // arguments and compares the first 3 characters.
    const size_t size = strlen(str1);
    const size_t bytes = size * sizeof(char);

    const int blockSize = 1024;
    // Integer ceil-division: enough blocks for one thread per character.
    const int gridSize = (static_cast<int>(size) + blockSize - 1) / blockSize;

    checkCuda(cudaMalloc(&dev_str1, bytes), "cudaMalloc(dev_str1)");
    checkCuda(cudaMalloc(&dev_str2, bytes), "cudaMalloc(dev_str2)");
    checkCuda(cudaMalloc(&dev_result, sizeof(int)), "cudaMalloc(dev_result)");

    checkCuda(cudaMemcpy(dev_str1, str1, bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(dev_str1)");
    checkCuda(cudaMemcpy(dev_str2, str2, bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(dev_str2)");
    // The kernel only adds to the counter, so it must start at zero.
    checkCuda(cudaMemcpy(dev_result, &result, sizeof(int), cudaMemcpyHostToDevice),
              "cudaMemcpy(dev_result)");

    getHammingDistance<<<gridSize, blockSize>>>(dev_str1, dev_str2, dev_result);
    // A launch does not return an error code: configuration errors show up
    // via cudaGetLastError(), execution errors at the next synchronisation.
    checkCuda(cudaGetLastError(), "getHammingDistance launch");
    checkCuda(cudaDeviceSynchronize(), "getHammingDistance execution");

    checkCuda(cudaMemcpy(&result, dev_result, sizeof(int), cudaMemcpyDeviceToHost),
              "cudaMemcpy(result)");

    std::cout << result << "\n";

    checkCuda(cudaFree(dev_str1), "cudaFree(dev_str1)");
    checkCuda(cudaFree(dev_str2), "cudaFree(dev_str2)");
    checkCuda(cudaFree(dev_result), "cudaFree(dev_result)");

    return result;
}

// Program entry point: runs the hard-coded comparison once. The returned
// distance is not needed here, so it is explicitly discarded.
int main() {
    (void) getDist();
    return 0;
}

更好的做法是使用Thrust:速度相同,代码更少,出错的机会也更少。

#include <cassert>
#include <iostream>
#include <string>

#include <thrust/device_vector.h>
#include <thrust/inner_product.h>

// Binary functor usable on host and device: yields 1 when the two characters
// differ and 0 when they are equal.
struct HammingDistance {
    __host__ __device__ int operator()(char x, char y) const {
        if (x == y) {
            return 0;
        }
        return 1;
    }
};

// Returns the number of positions at which str1 and str2 differ.
// The two strings are expected to have the same length (checked with assert).
int getDist(std::string const &str1, std::string const &str2)
{
    assert(str1.size() == str2.size());

    // Copy both strings into device vectors.
    thrust::device_vector<char> lhs(str1.begin(), str1.end());
    thrust::device_vector<char> rhs(str2.begin(), str2.end());

    // inner_product applies HammingDistance to each character pair and sums
    // the per-position results with plus<int>, starting from 0.
    const int mismatches = thrust::inner_product(
        lhs.begin(), lhs.end(), rhs.begin(),
        0, thrust::plus<int>(), HammingDistance());
    return mismatches;
}

// Program entry point: prints the distance between "AAA" and "ATG" followed
// by a newline.
int main() {
    const std::string first("AAA");
    const std::string second("ATG");

    const int distance = getDist(first, second);
    std::cout << distance << "\n";
    return 0;
}