Question

我正在编写合并排序的cuda版本，如果我使用cudaMemcpyDeviceToHost来获取GPU中的元素列表，它会给出内存错误，另一方面如果我正在注释掉该行，则程序不会排序正常。任何人都可以建议。

/* C program for Merge Sort with Cuda Technology*/
 #include<stdlib.h>
 #include<stdio.h>
 #include <cuda.h>
 #include <sys/time.h>

 #define THR1 1000
 #define THR2 10000

 #define N 800000

 /*
 ********************************
  Program UTILITY Code here
 ********************************
 */

 static void HandleError( cudaError_t err, const char *file, int line ) {
 if (err != cudaSuccess) {
 printf( "%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
 exit( EXIT_FAILURE );
 }}

 #define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

 /* Function to print an array */
 void printArray(int A[], int size)
 {
 int i;
 for (i=0; i < size; i++)
    printf("%d ", A[i]);
 printf("\n");
}


 //Function to test if the output is in asending order or not
 void test(int a[], int n) {
  int i;
 for (i=1;i<n;++i) {
 if (a[i]<a[i-1]) {
  break;
 }
 }
if (i<n) { 
 for (i=1;i<n;++i) {  
   if (a[i]>a[i-1]){
      break;
   }
}
if (i<n) {
  printf("\nArray is not sorted\n");
 }
 }
 else {
 printf("\nArray is sorted\n");
 }
 }
/*
*****************************************
 Sequential Version here
*****************************************
*/

 void insertionSort(int array[], int min, int max)
{
int key ;
// we loop through all elements in the original array from the min + 1 element
for (int j = min + 1 ; j <= max ; j++)
{
    // store the current element as the key
    key = array[j] ;
    // get the element just before the current element
    int i = j - 1 ;
    // loop through all elements from the key to the min element
    // check if the current element is smaller than the key
    while (i >= min && array[i] > key)
    {
        // we move the current element backward
        array[i+1] = array[i] ;
        i-- ;
    }
    // we finally move the key
    array[i+1] = key ;
}
}



 void merge(int array[], int min, int max, int mid)
{
int firstIndex = min;
int secondIndex = mid + 1;
int * tempArray = new int [max + 1];


// While there are elements in the left or right runs
for (int index = min; index <= max; index++) {
    // If left run head exists and is <= existing right run head.
    if (firstIndex <= mid && (secondIndex > max || array[firstIndex] <= array[secondIndex]))
    {
        tempArray[index] = array[firstIndex];
        firstIndex = firstIndex + 1;
    }

    else
    {
        tempArray[index] = array[secondIndex];
        secondIndex = secondIndex + 1;
    }

} 

// transfer to the initial array
for (int index = min ; index <= max ; index++)
    array[index] = tempArray[index];
  }



 void smergeSort(int array[], int min, int max, int threshold)
{
// prerequisite
if ( (max - min + 1) <= threshold )
{
    insertionSort(array, min, max);
}
else
{
    // get the middle point
    int mid = (max+min) / 2;

    // apply merge sort to both parts of this
    smergeSort(array, min, mid, threshold);
    smergeSort(array, mid+1, max, threshold);

    // and finally merge all that sorted stuff
    merge(array, min, max, mid) ;
}
}

 /*
 *****************************************
 Parallel Version here
 *****************************************
 */
 __device__ void gpu_bottomUpMerge(int* source, int* dest, int start, int middle, int end) {
int i = start;
int j = middle;
for (int k = start; k < end; k++) {
    if (i < middle && (j >= end || source[i] < source[j])) {
        dest[k] = source[i];
        i++;
    } else {
        dest[k] = source[j];
        j++;
    }
}
}

__global__ void gpu_mergesort(int* source, int* dest, int size, int width, int slices, dim3* threads, dim3* blocks) {

int idx = blockDim .x * blockIdx .x + threadIdx .x;

int start = width*idx*slices, 
     middle, 
     end;

for (int slice = 0; slice < slices; slice++) {
    if (start >= size)
        break;

    middle = min(start + (width >> 1), size);
    end = min(start + width, size);
    gpu_bottomUpMerge(source, dest, start, middle, end);
    start += width;
}
}

void mergesort(int* data, int size, dim3 threadsPerBlock, dim3 blocksPerGrid) {

// Allocate two arrays on the GPU we switch back and forth between them during the sort

int* D_data;
int* D_swp;
dim3* D_threads;
dim3* D_blocks;

// Actually allocate the two arrays

HANDLE_ERROR(cudaMalloc((void**) &D_data, size * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**) &D_swp, size * sizeof(int)));

// Copy from our input list into the first array
HANDLE_ERROR(cudaMemcpy(D_data, data, size * sizeof(int), cudaMemcpyHostToDevice));  

int* A = D_data;
int* B = D_swp;

int nThreads = threadsPerBlock.x * threadsPerBlock.y * threadsPerBlock.z * blocksPerGrid.x * blocksPerGrid.y * blocksPerGrid.z;

// Divide the list and give pieces of it to each thread, letting the pieces grow bigger and bigger until the whole list is sorted
for (int width = 2; width < (size << 1); width <<= 1) {
    int slices = size / ((nThreads) * width) + 1;

    // Actually call the kernel
    gpu_mergesort<<<blocksPerGrid, threadsPerBlock>>>(A, B, size, width, slices, D_threads, D_blocks);
    cudaDeviceSynchronize();

    // Switch the input / output arrays instead of copying them around
    A = A == D_data ? D_swp : D_data;
    B = B == D_data ? D_swp : D_data;
}

// Get the list back from the GPU 
HANDLE_ERROR(cudaMemcpy(data, A, size * sizeof(int), cudaMemcpyDeviceToHost));

// Free the GPU memory
HANDLE_ERROR(cudaFree(A));
HANDLE_ERROR(cudaFree(B));

}




 /* Driver program to test above functions */
 int main()
{


dim3 threadsPerBlock;
dim3 blocksPerGrid;

threadsPerBlock.x = 224;
blocksPerGrid.x = 10; 

int i, *a;

 printf("How many elements in the array? ");

 a = (int *)malloc(sizeof(int) * N);        
 srand(time(0));
 for(i=0;i<N;i++)
       {
         a[i]=rand()%1000;
       }    
 printf("List Before Sorting...\n");
// printArray(a, N);

if (N<=THR2)
{
 clock_t begin = clock();
 smergeSort(a, 0, N - 1, THR2);
 clock_t end = clock(); 
 printf("\nSorted array:  ");
 //printArray(a,N);
 printf("\n");
 test(a,N);
 double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
 printf("SM");
 printf("Elapsed: %f seconds\n",time_spent );
 printf("\nSize of the array is %d",N);

 exit(0);
}      

else 
{
 clock_t begin = clock();
 mergesort(a, N, threadsPerBlock, blocksPerGrid);
 clock_t end = clock();
 printf("\nSorted array:  ");
 //printArray(a,N);
 printf("\n");
 test(a,N);
 double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
 printf("Cuda");
 printf("Elapsed: %f seconds\n",time_spent );
 printf("\nSize of the array is %d\n",N);

 exit(0);
}      
}

现在程序工作正常，即使对于大型元素，但是，当我使用大量线程时，让我们说块10和线程224;给出错误： - 在mergesort.cu中遇到非法内存访问在第215行

调试代码后，我再次遇到错误： -

========= Invalid __global__ read of size 4
=========     at 0x00000148 in 
/home/sharmpra/mergesort.cu:150:gpu_mergesort(int*, int*, int, int, int, dim3*, dim3*)
=========     by thread (96,0,0) in block (9,0,0)
=========     Address 0x915fc0000 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/usr/lib64/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x204235]
=========     Host Frame:./a.out [0x1e831]
=========     Host Frame:./a.out [0x3c3d3]
=========     Host Frame:./a.out [0x38a8]
=========     Host Frame:./a.out [0x37b1]
=========     Host Frame:./a.out [0x3810]
=========     Host Frame:./a.out [0x33d1]
=========     Host Frame:./a.out [0x35ae]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf0) [0x20790]
=========     Host Frame:./a.out [0x2bc9]
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaDeviceSynchronize.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib64/libcuda.so.1 [0x2ef503]
=========     Host Frame:./a.out [0x3c0f6]
=========     Host Frame:./a.out [0x33da]
=========     Host Frame:./a.out [0x35ae]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf0) [0x20790]
=========     Host Frame:./a.out [0x2bc9]
=========

请有人建议我在程序中可以执行哪些其他操作来删除此类错误。此外，我正在使用此命令行设置：-nvcc -o a.out -Wno-deprecated-gpu-targets -lineinfo -arch = compute_20，sm_20 -rdc = true -lcudadevrt mergesort.cu

Answer 1

正如@Robert所解释的那样，代码是从全局内存中读取source [i]和source [j]，每个都是int数量（大小为4个字节），所以我试图避免使用相同的数组进行比较我补充说： - for（int k = start; k＆lt; end; k ++） dest [k] = source [k]; 在gpu_bottomUpMerge中，通过在我的代码中添加这一行，它可以处理更多的块和线程但仍然为大量元素提供非法内存错误，因此，为了解决该问题，我使用了指针而不是int，我使用了很长时间。以下是该计划的更新版本：

/* C program for Merge Sort with Cuda Technology*/
#include<stdlib.h>
#include<stdio.h>
#include <cuda.h>
#include <sys/time.h>

#define THR1 1000
#define THR2 10000

#define N 800000

/*
********************************
Program UTILITY Code here
********************************
*/

static void HandleError( cudaError_t err, const char *file, int line ) {
if (err != cudaSuccess) {
printf( "%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
exit( EXIT_FAILURE );
}}

#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

/* Function to print an array */
void printArray(int A[], int size)
{
int i;
for (i=0; i < size; i++)
    printf("%d ", A[i]);
printf("\n");
}


//Function to test if the output is in ascending order or not
void test(int a[], int n) {
int i;
for (i=1;i<n;++i) {
if (a[i]<a[i-1]) {
  break;
}
}
if (i<n) { 
for (i=1;i<n;++i) {  
   if (a[i]>a[i-1]){
      break;
   }
 }
if (i<n) {
  printf("\nArray is not sorted\n");
}
}
else {
printf("\nArray is sorted\n");
}
}
/*
*****************************************
Sequential Version here
*****************************************
*/

void insertionSort(int array[], int min, int max)
{
int key ;
// we loop through all elements in the original array from the min + 1 element
for (int j = min + 1 ; j <= max ; j++)
{
    // store the current element as the key
    key = array[j] ;
    // get the element just before the current element
    int i = j - 1 ;
    // loop through all elements from the key to the min element
    // check if the current element is smaller than the key
    while (i >= min && array[i] > key)
    {
        // we move the current element backward
        array[i+1] = array[i] ;
        i-- ;
    }
    // we finally move the key
    array[i+1] = key ;
 }
 }



void merge(int array[], int min, int max, int mid)
{
int firstIndex = min;
int secondIndex = mid + 1;
int * tempArray = new int [max + 1];


// While there are elements in the left or right runs
for (int index = min; index <= max; index++) {
    // If left run head exists and is <= existing right run head.
    if (firstIndex <= mid && (secondIndex > max || array[firstIndex] <= array[secondIndex]))
    {
        tempArray[index] = array[firstIndex];
        firstIndex = firstIndex + 1;
    }

    else
    {
        tempArray[index] = array[secondIndex];
        secondIndex = secondIndex + 1;
    }

} 

// transfer to the initial array
for (int index = min ; index <= max ; index++)
    array[index] = tempArray[index];
}



void smergeSort(int array[], int min, int max, int threshold)
{
// prerequisite
if ( (max - min + 1) <= threshold )
{
    insertionSort(array, min, max);
}
else
{
    // get the middle point
    int mid = (max+min) / 2;

    // apply merge sort to both parts of this
    smergeSort(array, min, mid, threshold);
    smergeSort(array, mid+1, max, threshold);

    // and finally merge all that sorted stuff
    merge(array, min, max, mid) ;
 }
 }

/*
*****************************************
 Parallel Version here
*****************************************
*/
 __device__ void gpu_bottomUpMerge(int* source, int* dest, int start, int middle, int end) {
int i = start;
int j = middle;
for (int k = start; k < end; k++)
dest[k] = source[k];
for (int k = start; k < end; k++) {
    if (i < middle && (j >= end || source[i] < dest[j])) {
        dest[k] = source[i];
        i++;
    } else {
        dest[k] = source[j];
        j++;
    }
 }
}

__global__ void gpu_mergesort(int* source, int* dest, int size, int width, int slices, dim3* threads, dim3* blocks) {

int idx = blockDim .x * blockIdx .x + threadIdx .x;

int start = width*idx*slices, 
     middle, 
     end;

for (int slice = 0; slice < slices; slice++) {
    if (start >= size)
        break;

    middle = min(start + (width >> 1), size);
    end = min(start + width, size);
    gpu_bottomUpMerge(source, dest, start, middle, end);
    start += width;
}
}

void mergesort(int* data, int size, dim3 threadsPerBlock, dim3 blocksPerGrid) {

// Allocate two arrays on the GPU we switch back and forth between them during the sort

int* D_data;
int* D_swp;
dim3* D_threads;
dim3* D_blocks;

// Actually allocate the two arrays

HANDLE_ERROR(cudaMalloc((void**) &D_data, size * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**) &D_swp, size * sizeof(int)));

// Copy from our input list into the first array
HANDLE_ERROR(cudaMemcpy(D_data, data, size * sizeof(int), cudaMemcpyHostToDevice));  

int* A = D_data;
int* B = D_swp;

int nThreads = threadsPerBlock.x * threadsPerBlock.y * threadsPerBlock.z * blocksPerGrid.x * blocksPerGrid.y * blocksPerGrid.z;

// Divide the list and give pieces of it to each thread, letting the pieces grow bigger and bigger until the whole list is sorted
for (int width = 2; width < (size << 1); width <<= 1) {
    int slices = size / ((nThreads) * width) + 1;

    // Actually call the kernel
    gpu_mergesort<<<blocksPerGrid, threadsPerBlock>>>(A, B, size, width, slices, D_threads, D_blocks);
    cudaDeviceSynchronize();

    // Switch the input / output arrays instead of copying them around
    A = A == D_data ? D_swp : D_data;
    B = B == D_data ? D_swp : D_data;
}

// Get the list back from the GPU 
HANDLE_ERROR(cudaMemcpy(data, A, size * sizeof(int), cudaMemcpyDeviceToHost));

// Free the GPU memory
HANDLE_ERROR(cudaFree(A));
HANDLE_ERROR(cudaFree(B));

}




/* Driver program to test above functions */
int main()
{


dim3 threadsPerBlock;
dim3 blocksPerGrid;

threadsPerBlock.x = 122;
blocksPerGrid.x = 1; 

int i, *a;

 printf("How many elements in the array? ");

 a = (int *)malloc(sizeof(int) * N);        
 srand(time(0));
 for(i=0;i<N;i++)
       {
         a[i]=rand()%1000;
       }    
 printf("List Before Sorting...\n");
// printArray(a, N);

if (N<=THR2)
{
 clock_t begin = clock();
 smergeSort(a, 0, N - 1, THR2);
 clock_t end = clock(); 
 printf("\nSorted array:  ");
 //printArray(a,N);
 printf("\n");
 test(a,N);
 double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
 printf("SM");
 printf("Elapsed: %f seconds\n",time_spent );
 printf("\nSize of the array is %d",N);

 exit(0);
}      

else 
{
 clock_t begin = clock();
 mergesort(a, N, threadsPerBlock, blocksPerGrid);
 clock_t end = clock();
 printf("\nSorted array:  ");
 //printArray(a,N);
 printf("\n");
 test(a,N);
 double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
 printf("Cuda");
 printf("Elapsed: %f seconds\n",time_spent );
 printf("\nSize of the array is %d\n",N);

 exit(0);
}      
}

Cuda cudaMemcpyDeviceToHost给出了错误

1 个答案: