__global__ void sort_single(int *size, int *arr)
{
    // *size / 2 passes of an odd-even transposition sort (descending order)
    for (int m = 0; m < *size / 2; m++)
    {
        // even phase: compare each even index with the next index
        for (int i = 0; i < *size; i += 2)
        {
            if (arr[i + 1] > arr[i])
            {
                int temp = arr[i];
                arr[i] = arr[i + 1];
                arr[i + 1] = temp;
            }
        }
        /*for (int i = 0; i < size; i++)
            printf("%d ", arr[i]);
        printf("\n");*/
        // odd phase: compare each odd index with the next index
        for (int i = 1; i < *size; i += 2)
        {
            if (arr[i + 1] > arr[i])
            {
                int temp = arr[i];
                arr[i] = arr[i + 1];
                arr[i + 1] = temp;
            }
        }
    }
}
This is the kernel code for a CUDA GPU. It performs an odd-even sort. To call it, I launch it from main after preparing all the data:

sort_single<<<1,1>>>(d_a, d_b);

My question is: why does it give incorrect results here, while the same code gives correct results if I run it as a regular C/C++ function? If I remove the outer loop from the kernel and instead call the kernel inside that loop on the host, it works fine, as shown below.
for (int m = 0; m < N / 2; m++)
    sort_single<<<1,1>>>(d_a, d_b);
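To be concrete, the loop version launches a kernel that contains only the two inner passes of sort_single, unchanged, roughly like this (a sketch; the name sort_single_pass is only for illustration, in my code the kernel keeps the same name):

__global__ void sort_single_pass(int *size, int *arr)  // illustrative name for the loop-free variant
{
    // one even phase
    for (int i = 0; i < *size; i += 2)
    {
        if (arr[i + 1] > arr[i])
        {
            int temp = arr[i];
            arr[i] = arr[i + 1];
            arr[i + 1] = temp;
        }
    }
    // one odd phase
    for (int i = 1; i < *size; i += 2)
    {
        if (arr[i + 1] > arr[i])
        {
            int temp = arr[i];
            arr[i] = arr[i + 1];
            arr[i + 1] = temp;
        }
    }
}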
I am doing the same thing in both cases. I think it must have something to do with the number of steps this algorithm needs; each iteration takes two steps:
Compare each even index with the next index.
Compare each odd index with the next index.
I cannot understand why this starts happening as I increase the number of elements in the array, given that I am using a single GPU thread. I need to understand clearly how a single GPU thread differs from a CPU in order to explain the current behavior. The whole single file is linked on Drive.
Here are the full file contents:
#include "stdio.h"
__global__ void add(int *a , int *b ,int*c){
c[blockIdx.x] = a[blockIdx.x] + b[blockIdx.x];
}
__global__ void sort_single(int *size , int *arr){
for ( int m = 0; m < *size / 2; m++)
{
for (int i = 0; i < *size; i += 2)
{
if (arr[i + 1] > arr[i])
{
int temp = arr[i];
arr[i] = arr[i + 1];
arr[i + 1] = temp;
}
}
/*for (int i = 0; i < size; i++)
printf("%d ", arr[i]);
printf("\n");*/
for ( int i = 1; i < *size; i += 2)
{
if (arr[i + 1] > arr[i])
{
int temp = arr[i];
arr[i] = arr[i + 1];
arr[i + 1] = temp;
}
}
}
}
void random_ints(int *a, int N)
{
int i;
for (i = 0; i < N; ++i)
a[i] = rand() %5000;
}
void uniform_ints(int *a, int N)
{
int i;
for (i = 0; i < N; ++i)
a[i] = i+1;
}
int main(int argc , char**argv){
int N = 8;
if(argc>1)
{
N=atoi(argv[1]);
}
int *a , *b ;
int *d_a , *d_b ;
int isize = N * sizeof(int);
a = (int *)malloc(sizeof(int));a[0] = N;
b = (int *)malloc(isize);uniform_ints(b , N);
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
cudaStatus = cudaMalloc((void**)&d_a,sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&d_b,isize);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMemcpy(d_a, a , sizeof(int),cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(d_b, b , isize,cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
sort_single<<<1,1>>>(d_a,d_b);
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
cudaStatus = cudaMemcpy(b, d_b , isize,cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
for (int i = 0; i < N; i++)
printf("%d ", b[i]);
printf("\n");
Error:
cudaFree(d_a);
cudaFree(d_b);
return cudaStatus;
}
Answer 0 (score: -1)

The code compiles both as a simple C/C++ function and as a CUDA kernel, since it is written under the assumption that the array size is even. It does give correct results; the error was in the malloc calls. Anyway, thank you for responding.
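Since the kernel assumes an even array size, one way to drop that assumption is to guard the i + 1 access so the last pair is only compared when it actually exists, and to round the pass count up. This is only a sketch of that idea (the name sort_single_guarded is illustrative), not necessarily the fix that was applied here:

__global__ void sort_single_guarded(int *size, int *arr)  // illustrative variant, not the original fix
{
    // (*size + 1) / 2 iterations of (even phase + odd phase) gives at least *size
    // phases, which is enough for odd-even transposition sort of any length
    for (int m = 0; m < (*size + 1) / 2; m++)
    {
        // even phase: only compare pairs that lie fully inside the array
        for (int i = 0; i + 1 < *size; i += 2)
        {
            if (arr[i + 1] > arr[i])
            {
                int temp = arr[i];
                arr[i] = arr[i + 1];
                arr[i + 1] = temp;
            }
        }
        // odd phase: same guard on i + 1
        for (int i = 1; i + 1 < *size; i += 2)
        {
            if (arr[i + 1] > arr[i])
            {
                int temp = arr[i];
                arr[i] = arr[i + 1];
                arr[i + 1] = temp;
            }
        }
    }
}

It still sorts in descending order, exactly like the original comparisons, and still runs in a single thread.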