我的CUDA内核正在按键使用推力,排序和减少。 当我使用数组 超过460 时,它会开始显示不正确的结果。
任何人都可以解释这种行为吗?或者它与我的机器有关?
尽管规模很大,但排序正常,但REDUCE_BY_KEY效果不佳。并返回不正确的结果。
有关代码的更多详细信息, 我有4个阵列 1)输入键,定义为wholeSequenceArray。 2)在内核中定义的输入值,初始值为1。 3)输出键是保存输入键的不同值 4)输出值是保存与同一输入键对应的输入值之和。
有关reduce_by_key的更多说明,请访问此页面: https://thrust.github.io/doc/group__reductions.html#gad5623f203f9b3fdcab72481c3913f0e0
这是我的代码:
#include <cstdlib>
#include <stdlib.h>
#include <stdio.h>
#include <iostream>
#include <vector>
#include <fstream>
#include <string>
#include <cuda.h>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/reduce.h>
#include <thrust/execution_policy.h>
using namespace std;
#define size 461
__global__ void calculateOccurances(unsigned int *input_keys,
unsigned int *output_Values) {
int tid = threadIdx.x;
const int N = size;
__shared__ unsigned int input_values[N];
unsigned int outputKeys[N];
int i = tid;
while (i < N) {
if (tid < N) {
input_values[tid] = 1;
}
i += blockDim.x;
}
__syncthreads();
thrust::sort(thrust::device, input_keys, input_keys + N);
thrust::reduce_by_key(thrust::device, input_keys, input_keys + N,
input_values, outputKeys, output_Values);
if (tid == 0) {
for (int i = 0; i < N; ++i) {
printf("%d,", output_Values[i]);
}
}
}
int main(int argc, char** argv) {
unsigned int wholeSequenceArray[size] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5,
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4,
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3,
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2,
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1,
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,1 };
cout << "wholeSequenceArray:" << endl;
for (int i = 0; i < size; i++) {
cout << wholeSequenceArray[i] << ",";
}
cout << "\nStart C++ Array New" << endl;
cout << "Size of Input:" << size << endl;
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
printf("Max threads per block: %d\n", prop.maxThreadsPerBlock);
unsigned int counts[size];
unsigned int *d_whole;
unsigned int *d_counts;
cudaMalloc((void**) &d_whole, size * sizeof(unsigned int));
cudaMalloc((void**) &d_counts, size * sizeof(unsigned int));
cudaMemcpy(d_whole, wholeSequenceArray, size * sizeof(unsigned int),
cudaMemcpyHostToDevice);
calculateOccurances<<<1, size>>>(d_whole, d_counts);
cudaMemcpy(counts, d_counts, size * sizeof(unsigned int),
cudaMemcpyDeviceToHost);
cout << endl << "Counts" << endl << endl;
for (int i = 0; i < size; ++i) {
cout << counts[i] << ",";
}
cout << endl;
cudaFree(d_whole);
}
答案 0 :(得分:1)
当您在内核中调用推力算法时,将从每个CUDA线程完整地调度该推力算法。因此,您的代码在同一位置对相同数据(每个CUDA内核线程一次)执行461次排序操作。这意味着每个线程在排序操作期间移动数据时将相互踩踏。
如果您只想使用您在问题中概述的方法计算出现的数字(有效的直方图),并且您想要使用推力,则根本不需要编写CUDA内核。
如果你真的想在CUDA内核中(正确地)这样做,那么你需要将推力操作(sort和reduce_by_key)限制为仅从单个线程中执行。 (甚至这种方法也只限于一个区块)。
我并不认为第二种(CUDA内核)方法有多大意义,但为了完整性,我修改了代码以包含每种方法的正确示例。请注意,一旦执行缩小,打印出每个阵列中的所有461个条目就不再有任何意义,因此为了清楚起见,我将打印输出限制在每个阵列的前25个条目中:
$ cat t91.cu
#include <cstdlib>
#include <stdlib.h>
#include <stdio.h>
#include <iostream>
#include <vector>
#include <fstream>
#include <string>
#include <cuda.h>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/reduce.h>
#include <thrust/execution_policy.h>
#include <thrust/iterator/constant_iterator.h>
using namespace std;
#define size 461
__global__ void calculateOccurances(unsigned int *input_keys,
unsigned int *output_Values) {
int tid = threadIdx.x;
const int N = size;
__shared__ unsigned int input_values[N];
unsigned int outputKeys[N];
int i = tid;
while (i < N) {
if (tid < N) {
input_values[tid] = 1;
}
i += blockDim.x;
}
__syncthreads();
if (tid == 0){
thrust::sort(thrust::device, input_keys, input_keys + N);
thrust::reduce_by_key(thrust::device, input_keys, input_keys + N,
input_values, outputKeys, output_Values);
}
if (tid == 0) {
printf("from kernel:\n");
for (int i = 0; i < 25; ++i) {
printf("%d,", output_Values[i]);
}
}
}
int main(int argc, char** argv) {
unsigned int wholeSequenceArray[size] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5,
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4,
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3,
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2,
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1,
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,1 };
cout << "wholeSequenceArray:" << endl;
for (int i = 0; i < size; i++) {
cout << wholeSequenceArray[i] << ",";
}
cout << "\nStart C++ Array New" << endl;
cout << "Size of Input:" << size << endl;
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
printf("Max threads per block: %d\n", prop.maxThreadsPerBlock);
//just using thrust
thrust::device_vector<int> d_seq(wholeSequenceArray, wholeSequenceArray+size);
thrust::device_vector<int> d_val_out(size);
thrust::device_vector<int> d_key_out(size);
thrust::sort(d_seq.begin(), d_seq.end());
int rsize = thrust::get<0>(thrust::reduce_by_key(d_seq.begin(), d_seq.end(), thrust::constant_iterator<int>(1), d_key_out.begin(), d_val_out.begin())) - d_key_out.begin();
std::cout << "rsize:" << rsize << std::endl;
std::cout << "Thrust keys:" << std::endl;
thrust::copy_n(d_key_out.begin(), rsize, std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl << "Thrust vals:" << std::endl;
thrust::copy_n(d_val_out.begin(), rsize, std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
// in a cuda kernel
unsigned int counts[size];
unsigned int *d_whole;
unsigned int *d_counts;
cudaMalloc((void**) &d_whole, size * sizeof(unsigned int));
cudaMalloc((void**) &d_counts, size * sizeof(unsigned int));
cudaMemcpy(d_whole, wholeSequenceArray, size * sizeof(unsigned int),
cudaMemcpyHostToDevice);
calculateOccurances<<<1, size>>>(d_whole, d_counts);
cudaMemcpy(counts, d_counts, size * sizeof(unsigned int),
cudaMemcpyDeviceToHost);
std::cout << "from Host:" << std::endl;
cout << endl << "Counts" << endl << endl;
for (int i = 0; i < 25; ++i) {
cout << counts[i] << ",";
}
cout << endl;
cudaFree(d_whole);
}
$ nvcc -arch=sm_61 -o t91 t91.cu
$ ./t91
wholeSequenceArray:
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,
Start C++ Array New
Size of Input:461
Max threads per block: 1024
rsize:20
Thrust keys:
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,
Thrust vals:
24,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,
from kernel:
24,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,526324,526325,526325,526327,526329,from Host:
Counts
24,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,526324,526325,526325,526327,526329,
$
注意:
我在推力示例中包含了一个方法,因此您可以准确了解输出数组的大小。
推力方法应该可以独立于size
参数正常工作 - 受GPU限制(例如内存大小)的限制。 CUDA内核方法实际上只是从单个线程执行推力代码,因此运行超过1个块并不是真的合理。
您可以参考this question/answer,了解有关使用CUDA内核推力的更多讨论。