Cuda cudaMemcpyDeviceToHost给出了错误

时间:2017-05-17 14:04:09

标签: sorting cuda


/* C program for Merge Sort with Cuda Technology*/
 #include <stdio.h>
 #include <stdlib.h>
 #include <time.h>
 #include <sys/time.h>
 #include <cuda.h>

 #define THR1 1000
 #define THR2 10000

 #define N 800000

 /* Program UTILITY Code here */

 /*
  * Terminate the program with a diagnostic when a CUDA runtime call fails.
  *
  *   err  - status code returned by the CUDA runtime call being checked
  *   file - source file of the call site (supplied by HANDLE_ERROR)
  *   line - source line of the call site (supplied by HANDLE_ERROR)
  *
  * Returns normally only when err == cudaSuccess; otherwise prints the
  * CUDA error string and exits with EXIT_FAILURE.
  */
 static void HandleError( cudaError_t err, const char *file, int line ) {
     if (err != cudaSuccess) {
         printf( "%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
         exit( EXIT_FAILURE );
     }
 }

 /* Check a CUDA runtime call and report the calling file/line on failure. */
 #define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

 /*
  * Print the first `size` elements of A on a single line, each followed by
  * a space, then terminate the line.
  */
 void printArray(int A[], int size)
 {
     int i;
     for (i = 0; i < size; i++)
         printf("%d ", A[i]);
     /* End the line so that later output does not run into the array dump. */
     printf("\n");
 }

 /*
  * Report on stdout whether a[0..n-1] is in ascending (non-decreasing) order.
  *
  * Prints "Array is sorted" when no element is smaller than its predecessor
  * (this includes n <= 1), and "Array is not sorted" otherwise.
  */
 void test(int a[], int n) {
     int i;
     /* Stop at the first position where the ascending order is violated. */
     for (i = 1; i < n; ++i) {
         if (a[i] < a[i-1]) {
             break;
         }
     }
     /* i < n means the scan stopped early, i.e. an inversion was found. */
     if (i < n) {
         printf("\nArray is not sorted\n");
     }
     else {
         printf("\nArray is sorted\n");
     }
 }
 /* Sequential Version here */

 /*
  * Sort array[min..max] (both bounds inclusive) in ascending order, in place,
  * using insertion sort. Elements outside that range are not touched.
  * A range with max <= min is left unchanged.
  */
 void insertionSort(int array[], int min, int max)
 {
     int key;
     // visit every element after the first one of the range
     for (int j = min + 1; j <= max; j++)
     {
         // the element to be inserted into the already sorted prefix
         key = array[j];
         // start comparing with the element just before it
         int i = j - 1;
         // shift every larger element of the sorted prefix one slot to the right
         while (i >= min && array[i] > key)
         {
             array[i+1] = array[i];
             i--;
         }
         // drop the key into the gap that was opened
         array[i+1] = key;
     }
 }

 /*
  * Merge the two adjacent sorted runs array[min..mid] and array[mid+1..max]
  * (all bounds inclusive) into one sorted run stored back in array[min..max].
  *
  * On equal keys the element of the left run is taken first.
  * A temporary buffer of (max - min + 1) ints is allocated for the merge and
  * released before returning.
  */
 void merge(int array[], int min, int max, int mid)
 {
     int count = max - min + 1;
     if (count <= 0)
         return;                      // empty range: nothing to merge

     int firstIndex = min;            // head of the left run
     int secondIndex = mid + 1;       // head of the right run
     // Only the merged range needs scratch space, not max + 1 elements.
     int * tempArray = new int [count];

     // While there are elements in the left or right runs
     for (int index = 0; index < count; index++) {
         // If left run head exists and is <= existing right run head.
         if (firstIndex <= mid && (secondIndex > max || array[firstIndex] <= array[secondIndex])) {
             tempArray[index] = array[firstIndex];
             firstIndex = firstIndex + 1;
         } else {
             tempArray[index] = array[secondIndex];
             secondIndex = secondIndex + 1;
         }
     }

     // transfer to the initial array
     for (int index = 0; index < count; index++)
         array[min + index] = tempArray[index];

     // release the scratch buffer so repeated merges do not leak memory
     delete [] tempArray;
 }

 void smergeSort(int array[], int min, int max, int threshold)
// prerequisite
if ( (max - min + 1) <= threshold )
    insertionSort(array, min, max);
    // get the middle point
    int mid = (max+min) / 2;

    // apply merge sort to both parts of this
    smergeSort(array, min, mid, threshold);
    smergeSort(array, mid+1, max, threshold);

    // and finally merge all that sorted stuff
    merge(array, min, max, mid) ;

 /* Parallel Version here */
 /*
  * Merge the two adjacent sorted runs source[start..middle) and
  * source[middle..end) into dest[start..end).
  *
  *   source - input buffer holding the two sorted runs
  *   dest   - output buffer; only indices start..end-1 are written
  *   start, middle, end - half-open run boundaries, start <= middle <= end
  *
  * When middle == end the left run is simply copied. On equal keys the
  * element of the right run is taken first (strict '<' comparison).
  * Also callable from host code so the merge logic can be checked on the CPU.
  */
 __host__ __device__ void gpu_bottomUpMerge(int* source, int* dest, int start, int middle, int end) {
     int i = start;    // read cursor in the left run
     int j = middle;   // read cursor in the right run
     for (int k = start; k < end; k++) {
         if (i < middle && (j >= end || source[i] < source[j])) {
             dest[k] = source[i];
             i++;      // consume the left-run element
         } else {
             dest[k] = source[j];
             j++;      // consume the right-run element
         }
     }
 }

/*
 * One pass of the bottom-up merge sort.
 *
 * Each thread handles `slices` consecutive windows of `width` elements,
 * beginning at element width * idx * slices, where idx is the thread's flat
 * index along x. For every window that starts inside the array, the two
 * halves of the window are merged from `source` into `dest`; windows are
 * clamped to `size`.
 *
 *   source, dest - device buffers of `size` ints (input / output of the pass)
 *   size         - number of elements in each buffer
 *   width        - window size of this pass
 *   slices       - number of windows assigned to each thread
 *   threads, blocks - not dereferenced by this kernel; kept so existing
 *                     launches keep compiling
 *
 * Only blockDim.x / blockIdx.x / threadIdx.x are used to build idx.
 */
__global__ void gpu_mergesort(int* source, int* dest, int size, int width, int slices, dim3* threads, dim3* blocks) {

    // 64-bit arithmetic: width * idx * slices exceeds INT_MAX for large
    // widths and many threads (e.g. width = 2^20, idx = 2112). In 32-bit int
    // the product wraps negative, passes the "start >= size" test and makes
    // the merge read out of bounds.
    long long idx = (long long)blockDim.x * blockIdx.x + threadIdx.x;

    long long start = (long long)width * idx * slices;

    for (int slice = 0; slice < slices; slice++) {
        // This window, and every later one of this thread, lies past the end.
        if (start >= size)
            break;

        // Clamp the window to the array; after clamping the values fit in int.
        long long middle = start + (width >> 1);
        if (middle > size)
            middle = size;
        long long end = start + width;
        if (end > size)
            end = size;

        gpu_bottomUpMerge(source, dest, (int)start, (int)middle, (int)end);
        start += width;
    }
}

/*
 * Sort data[0..size-1] in ascending order on the GPU with a bottom-up merge
 * sort, and copy the result back into `data`.
 *
 *   data            - host array to sort in place
 *   size            - number of elements; the pass width is handed to the
 *                     kernel as an int, so size is expected to be <= 2^30
 *   threadsPerBlock - block dimensions used for every kernel launch
 *   blocksPerGrid   - grid dimensions used for every kernel launch
 *
 * Any CUDA failure is reported through HANDLE_ERROR, which exits the program.
 * NOTE(review): nThreads counts all three dimensions, while the kernel builds
 * its index from the x dimension only, so a 1-D launch is assumed.
 */
void mergesort(int* data, int size, dim3 threadsPerBlock, dim3 blocksPerGrid) {

    // Nothing to sort.
    if (data == NULL || size <= 0)
        return;

    // Allocate two arrays on the GPU we switch back and forth between them during the sort
    int* D_data = NULL;
    int* D_swp = NULL;
    // Never allocated; the kernel does not dereference them. Initialised so
    // that no indeterminate pointer value is passed to the launch.
    dim3* D_threads = NULL;
    dim3* D_blocks = NULL;

    // Actually allocate the two arrays
    HANDLE_ERROR(cudaMalloc((void**) &D_data, size * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**) &D_swp, size * sizeof(int)));

    // Copy from our input list into the first array
    HANDLE_ERROR(cudaMemcpy(D_data, data, size * sizeof(int), cudaMemcpyHostToDevice));

    int* A = D_data;
    int* B = D_swp;

    // 64-bit: nThreads * width below exceeds INT_MAX for the larger widths.
    long long nThreads = (long long)threadsPerBlock.x * threadsPerBlock.y * threadsPerBlock.z *
                         blocksPerGrid.x * blocksPerGrid.y * blocksPerGrid.z;

    // Divide the list and give pieces of it to each thread, letting the pieces grow bigger and bigger until the whole list is sorted
    for (long long width = 2; width < 2LL * size; width <<= 1) {
        long long slices = size / (nThreads * width) + 1;

        // Actually call the kernel
        gpu_mergesort<<<blocksPerGrid, threadsPerBlock>>>(A, B, size, (int)width, (int)slices, D_threads, D_blocks);
        // A bad launch configuration is only visible through the last-error state.
        HANDLE_ERROR(cudaGetLastError());

        // Switch the input / output arrays instead of copying them around
        A = A == D_data ? D_swp : D_data;
        B = B == D_data ? D_swp : D_data;
    }

    // Get the list back from the GPU
    // (errors raised while the kernels were running are reported by this call)
    HANDLE_ERROR(cudaMemcpy(data, A, size * sizeof(int), cudaMemcpyDeviceToHost));

    // Free the GPU memory
    HANDLE_ERROR(cudaFree(D_data));
    HANDLE_ERROR(cudaFree(D_swp));
}

 /*
  * Driver program to test above functions.
  *
  * Fills an array of N pseudo-random ints, sorts it with the sequential merge
  * sort when N <= THR2 and with the GPU merge sort otherwise, prints the
  * elapsed CPU time and checks the result with test().
  */
 int main()
 {
     dim3 threadsPerBlock;
     dim3 blocksPerGrid;

     threadsPerBlock.x = 224;
     blocksPerGrid.x = 10;

     int i, *a;

     printf("How many elements in the array? ");
     // The element count is the compile-time constant N; echo it instead of
     // leaving the prompt unanswered.
     printf("%d\n", N);

     a = (int *)malloc(sizeof(int) * N);
     if (a == NULL) {
         fprintf(stderr, "Out of host memory for %d elements\n", N);
         return EXIT_FAILURE;
     }

     // Give the array defined contents before sorting it; malloc returns
     // uninitialised memory. rand() is left unseeded so runs are repeatable.
     for (i = 0; i < N; i++)
         a[i] = rand();

     printf("List Before Sorting...\n");
     // printArray(a, N);

     if (N <= THR2)
     {
         // Small input: sequential version
         clock_t begin = clock();
         smergeSort(a, 0, N - 1, THR2);
         clock_t end = clock();
         printf("\nSorted array:  ");
         double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
         printf("Elapsed: %f seconds\n",time_spent );
         printf("\nSize of the array is %d",N);
     }
     else
     {
         // Large input: GPU version
         clock_t begin = clock();
         mergesort(a, N, threadsPerBlock, blocksPerGrid);
         clock_t end = clock();
         printf("\nSorted array:  ");
         double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
         printf("Elapsed: %f seconds\n",time_spent );
         printf("\nSize of the array is %d\n",N);
     }

     // Verify the result instead of trusting the timing output alone.
     test(a, N);

     free(a);
     return 0;
 }


现在程序即使对于大量元素也能正常工作;但是,当我使用大量线程时(比如 10 个块、每块 224 个线程),就会报错:在 mergesort.cu 的第 215 行遇到非法内存访问(an illegal memory access was encountered)。

调试代码后,我再次遇到错误: -

========= Invalid __global__ read of size 4
=========     at 0x00000148 in 
/home/sharmpra/*, int*, int, int, int, dim3*, dim3*)
=========     by thread (96,0,0) in block (9,0,0)
=========     Address 0x915fc0000 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/usr/lib64/ (cuLaunchKernel + 0x2c5) [0x204235]
=========     Host Frame:./a.out [0x1e831]
=========     Host Frame:./a.out [0x3c3d3]
=========     Host Frame:./a.out [0x38a8]
=========     Host Frame:./a.out [0x37b1]
=========     Host Frame:./a.out [0x3810]
=========     Host Frame:./a.out [0x33d1]
=========     Host Frame:./a.out [0x35ae]
=========     Host Frame:/lib64/ (__libc_start_main + 0xf0) [0x20790]
=========     Host Frame:./a.out [0x2bc9]
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaDeviceSynchronize.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib64/ [0x2ef503]
=========     Host Frame:./a.out [0x3c0f6]
=========     Host Frame:./a.out [0x33da]
=========     Host Frame:./a.out [0x35ae]
=========     Host Frame:/lib64/ (__libc_start_main + 0xf0) [0x20790]
=========     Host Frame:./a.out [0x2bc9]

请问我还可以在程序中做哪些修改来消除此类错误?另外,我使用的编译命令是:nvcc -o a.out -Wno-deprecated-gpu-targets -lineinfo -arch=compute_20,sm_20 -rdc=true -lcudadevrt

正如 @Robert 所解释的,代码从全局内存中读取 source[i] 和 source[j],它们都是 int 类型(大小为 4 个字节)。为了避免在同一个数组上做比较,我在 gpu_bottomUpMerge 中添加了:for (int k = start; k < end; k++) dest[k] = source[k]; 添加这一行之后,程序可以使用更多的块和线程,但在元素数量很大时仍然会出现非法内存访问错误。因此,为了解决该问题,我把用作下标(指针)的变量类型从 int 改成了 long。以下是该程序的更新版本:

