atomicAdd 给出了错误的结果。我甚至已经在设备端把 result 初始化为 0,但结果仍然不对。这是为什么?
这是我调用的核函数:
// Kernel: each thread handles one character position of the two strings.
// Only global thread indices 0, 1 and 2 do any work; every other thread exits
// immediately. For each position where str1 and str2 differ, *result is
// incremented by one. The kernel only ever adds to *result and never resets it.
__global__
void getHammingDistance(char *str1, char *str2, int *result)
{
    const int pos = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard clause: positions past the three compared characters are ignored.
    if (pos >= 3)
        return;

    const bool differs = (str1[pos] != str2[pos]);
    if (differs)
    {
        // Atomic so that simultaneous increments from several threads all count.
        atomicAdd(result, 1);
    }
}
但 atomicAdd 给出的结果是错误的。下面是调用该核函数的主机端代码:
// Host-side driver from the question: allocates device buffers, copies two
// strings and a zeroed counter to the device, launches getHammingDistance,
// copies the counter back, prints it and returns it.
// NOTE(review): this is the version that misbehaves; it is kept unchanged and
// the comments below mark the lines responsible.
int getDist()
{
// NOTE(review): k and min are never read in this function.
int k = 9;
int min = INT_MAX;
char *dev_str1, *dev_str2;
int *dev_result;
int blockSize = 1024;
// ceil(4 / 1024.0f) evaluates to 1, so a single block of 1024 threads is launched.
int gridSize = (int) ceil((float) 4 / blockSize);
int result = 0;
// str1/str2 are host POINTERS to string literals (3 characters plus the NUL).
// NOTE(review): binding a string literal to a non-const char* is rejected or
// warned about by modern C++ compilers.
char *str1 = "AAA";
char *str2 = "ATG";
cudaMalloc((void**) &dev_str1, sizeof(char) *20);
cudaMalloc((void**) &dev_str2, sizeof(char) * 20);
cudaMalloc((void**) &dev_result, sizeof(int) * 10);
// BUG: &str1 / &str2 are the addresses of the pointer variables themselves,
// not of the characters they point to, so the device receives the bytes of a
// host pointer (plus whatever follows it in memory) instead of "AAA"/"ATG".
// The 20-byte count is also larger than either source object.
cudaMemcpy(dev_str1, &str1, 20 * sizeof(char), cudaMemcpyHostToDevice);
cudaMemcpy(dev_str2, &str2, 20 * sizeof(char), cudaMemcpyHostToDevice);
// BUG: result is a single int, but 10 * sizeof(int) bytes are read from it.
cudaMemcpy(dev_result, &result, 10 * sizeof(int), cudaMemcpyHostToDevice);
getHammingDistance<<<gridSize, blockSize>>>(dev_str1, dev_str2, dev_result);
// BUG: 10 * sizeof(int) bytes are written into the single int `result`,
// overwriting adjacent host memory.
cudaMemcpy(&result, dev_result, 10 * sizeof(int), cudaMemcpyDeviceToHost);
// NOTE(review): unqualified cout needs a using-declaration/directive for std
// that is not visible in this snippet.
cout << result;
cudaFree(dev_str1);
cudaFree(dev_str2);
cudaFree(dev_result);
return result;
}
为什么会这样?这段代码按理应该能正常工作。
答案 0 :(得分:1)
原始代码中的错误出在下面这两行:
cudaMemcpy(dev_str1, &str1, 20 * sizeof(char), cudaMemcpyHostToDevice);
cudaMemcpy(dev_str2, &str2, 20 * sizeof(char), cudaMemcpyHostToDevice);
这两次调用传入的参数与函数签名不符。cudaMemcpy 的签名是:
cudaMemcpy(void * dst, const void * src, size_t count, enum cudaMemcpyKind kind)
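而您传入的 &str1 的类型是 char **(即指针变量本身的地址),而不是字符串数据的地址,因此拷贝到设备上的是主机指针的值,而不是字符串内容。
即使纠正了这一点,拷贝 20 个字节也会读到字符串之后的垃圾数据,因为您的字符串只有三个字符。对 result 的拷贝也有同样的问题:它只是一个 int,却拷贝了 10 * sizeof(int) 个字节。下面是基本正确的代码: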
#include <iostream>
// Kernel: adds 1 to *result for every position in [0, 3) at which str1 and
// str2 hold different characters. One thread is responsible for one position;
// threads whose global index is 3 or more do nothing. The kernel only adds to
// *result, it does not reset it.
__global__
void getHammingDistance(char *str1, char *str2, int *result)
{
    int lane = blockIdx.x * blockDim.x + threadIdx.x;

    // The bounds test comes first so out-of-range threads never touch memory.
    if (lane < 3 && str1[lane] != str2[lane])
        atomicAdd(result, 1);  // atomic: several threads may hit *result at once
}
// Computes the Hamming distance between the fixed strings "AAA" and "ATG" on
// the GPU, prints it to stdout followed by a newline, and returns it.
//
// Every CUDA call is checked. On the first failure the remaining steps are
// skipped, the device buffers are released, the error text is written to
// stderr, and -1 is returned (nothing is printed to stdout in that case).
int getDist()
{
    const char str1[] = "AAA";
    const char str2[] = "ATG";

    // Number of characters to compare, excluding the terminating NUL.
    const int len = static_cast<int>(sizeof(str1)) - 1;
    static_assert(sizeof(str1) == sizeof(str2),
                  "both strings must have the same length");
    // getHammingDistance compares exactly the first three positions.
    static_assert(sizeof(str1) - 1 == 3,
                  "getHammingDistance is hard-wired to three characters");
    const size_t bytes = len * sizeof(char);

    const int blockSize = 1024;
    // Integer ceil-division: enough blocks to give every position a thread.
    const int gridSize = (len + blockSize - 1) / blockSize;

    // Null-initialised so the unconditional cudaFree calls below are safe even
    // when an allocation was never reached.
    char *dev_str1 = nullptr;
    char *dev_str2 = nullptr;
    int *dev_result = nullptr;
    int result = 0;

    cudaError_t status = cudaMalloc(&dev_str1, bytes);
    if (status == cudaSuccess)
        status = cudaMalloc(&dev_str2, bytes);
    if (status == cudaSuccess)
        status = cudaMalloc(&dev_result, sizeof(int));

    if (status == cudaSuccess)
        status = cudaMemcpy(dev_str1, str1, bytes, cudaMemcpyHostToDevice);
    if (status == cudaSuccess)
        status = cudaMemcpy(dev_str2, str2, bytes, cudaMemcpyHostToDevice);
    // The kernel only accumulates, so the device counter must start at zero.
    if (status == cudaSuccess)
        status = cudaMemcpy(dev_result, &result, sizeof(int), cudaMemcpyHostToDevice);

    if (status == cudaSuccess)
    {
        getHammingDistance<<<gridSize, blockSize>>>(dev_str1, dev_str2, dev_result);
        // A launch does not return a status; configuration errors show up here.
        status = cudaGetLastError();
    }
    // Errors raised while the kernel runs are reported by the next sync.
    if (status == cudaSuccess)
        status = cudaDeviceSynchronize();
    if (status == cudaSuccess)
        status = cudaMemcpy(&result, dev_result, sizeof(int), cudaMemcpyDeviceToHost);

    cudaFree(dev_str1);
    cudaFree(dev_str2);
    cudaFree(dev_result);

    if (status != cudaSuccess)
    {
        std::cerr << "getDist: CUDA error: " << cudaGetErrorString(status) << "\n";
        return -1;
    }

    std::cout << result << "\n";
    return result;
}
// Program entry point: runs the GPU Hamming-distance demo once. The value
// returned by getDist() is not used here.
int main()
{
    (void) getDist();
    return 0;
}
更好的做法是使用 Thrust:速度相当,代码更少,也更不容易出错。
#include <cassert>
#include <iostream>
#include <stdexcept>
#include <string>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/inner_product.h>
// Binary functor usable on both host and device: yields 1 when the two
// characters differ and 0 when they are equal.
struct HammingDistance {
    __host__ __device__ int operator()(char x, char y) const {
        // A bool converts to exactly 0 or 1.
        return static_cast<int>(x != y);
    }
};
// Returns the Hamming distance (number of positions holding different
// characters) between str1 and str2, computed on the device with Thrust.
//
// Parameters:
//   str1, str2 - strings to compare; they must have the same length.
// Returns:
//   the count of differing positions; 0 for two empty strings.
// Throws:
//   std::invalid_argument if the lengths differ. This is checked at run time
//   rather than with assert(), because assert() is compiled out under NDEBUG
//   and inner_product would then read past the end of the shorter vector.
int getDist(std::string const &str1, std::string const &str2)
{
    if (str1.size() != str2.size()) {
        throw std::invalid_argument("getDist: strings must have equal length");
    }

    // Constructing a device_vector from host iterators copies the characters
    // to the device.
    thrust::device_vector<char> str1_dev(str1.begin(), str1.end());
    thrust::device_vector<char> str2_dev(str2.begin(), str2.end());

    // inner_product with (plus, HammingDistance) sums the per-position 0/1
    // mismatch flags, starting from 0.
    return thrust::inner_product(str1_dev.begin(), str1_dev.end(), str2_dev.begin(),
                                 0, thrust::plus<int>{}, HammingDistance{});
}
// Program entry point: prints the Hamming distance between "AAA" and "ATG",
// followed by a newline.
int main()
{
    const std::string lhs("AAA");
    const std::string rhs("ATG");

    const int distance = getDist(lhs, rhs);
    std::cout << distance << "\n";
    return 0;
}