我正在编写合并排序的cuda版本,如果我使用cudaMemcpyDeviceToHost来获取GPU中的元素列表,它会给出内存错误,另一方面如果我正在注释掉该行,则程序不会排序正常。任何人都可以建议。
/* C program for Merge Sort with Cuda Technology*/
#include<stdlib.h>
#include<stdio.h>
#include <cuda.h>
#include <sys/time.h>
#define THR1 1000
#define THR2 10000
#define N 800000
/*
********************************
Program UTILITY Code here
********************************
*/
static void HandleError( cudaError_t err, const char *file, int line ) {
if (err != cudaSuccess) {
printf( "%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
exit( EXIT_FAILURE );
}}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
/* Function to print an array */
void printArray(int A[], int size)
{
int i;
for (i=0; i < size; i++)
printf("%d ", A[i]);
printf("\n");
}
//Function to test if the output is in asending order or not
void test(int a[], int n) {
int i;
for (i=1;i<n;++i) {
if (a[i]<a[i-1]) {
break;
}
}
if (i<n) {
for (i=1;i<n;++i) {
if (a[i]>a[i-1]){
break;
}
}
if (i<n) {
printf("\nArray is not sorted\n");
}
}
else {
printf("\nArray is sorted\n");
}
}
/*
*****************************************
Sequential Version here
*****************************************
*/
void insertionSort(int array[], int min, int max)
{
int key ;
// we loop through all elements in the original array from the min + 1 element
for (int j = min + 1 ; j <= max ; j++)
{
// store the current element as the key
key = array[j] ;
// get the element just before the current element
int i = j - 1 ;
// loop through all elements from the key to the min element
// check if the current element is smaller than the key
while (i >= min && array[i] > key)
{
// we move the current element backward
array[i+1] = array[i] ;
i-- ;
}
// we finally move the key
array[i+1] = key ;
}
}
void merge(int array[], int min, int max, int mid)
{
int firstIndex = min;
int secondIndex = mid + 1;
int * tempArray = new int [max + 1];
// While there are elements in the left or right runs
for (int index = min; index <= max; index++) {
// If left run head exists and is <= existing right run head.
if (firstIndex <= mid && (secondIndex > max || array[firstIndex] <= array[secondIndex]))
{
tempArray[index] = array[firstIndex];
firstIndex = firstIndex + 1;
}
else
{
tempArray[index] = array[secondIndex];
secondIndex = secondIndex + 1;
}
}
// transfer to the initial array
for (int index = min ; index <= max ; index++)
array[index] = tempArray[index];
}
void smergeSort(int array[], int min, int max, int threshold)
{
// prerequisite
if ( (max - min + 1) <= threshold )
{
insertionSort(array, min, max);
}
else
{
// get the middle point
int mid = (max+min) / 2;
// apply merge sort to both parts of this
smergeSort(array, min, mid, threshold);
smergeSort(array, mid+1, max, threshold);
// and finally merge all that sorted stuff
merge(array, min, max, mid) ;
}
}
/*
*****************************************
Parallel Version here
*****************************************
*/
__device__ void gpu_bottomUpMerge(int* source, int* dest, int start, int middle, int end) {
int i = start;
int j = middle;
for (int k = start; k < end; k++) {
if (i < middle && (j >= end || source[i] < source[j])) {
dest[k] = source[i];
i++;
} else {
dest[k] = source[j];
j++;
}
}
}
__global__ void gpu_mergesort(int* source, int* dest, int size, int width, int slices, dim3* threads, dim3* blocks) {
int idx = blockDim .x * blockIdx .x + threadIdx .x;
int start = width*idx*slices,
middle,
end;
for (int slice = 0; slice < slices; slice++) {
if (start >= size)
break;
middle = min(start + (width >> 1), size);
end = min(start + width, size);
gpu_bottomUpMerge(source, dest, start, middle, end);
start += width;
}
}
void mergesort(int* data, int size, dim3 threadsPerBlock, dim3 blocksPerGrid) {
// Allocate two arrays on the GPU we switch back and forth between them during the sort
int* D_data;
int* D_swp;
dim3* D_threads;
dim3* D_blocks;
// Actually allocate the two arrays
HANDLE_ERROR(cudaMalloc((void**) &D_data, size * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**) &D_swp, size * sizeof(int)));
// Copy from our input list into the first array
HANDLE_ERROR(cudaMemcpy(D_data, data, size * sizeof(int), cudaMemcpyHostToDevice));
int* A = D_data;
int* B = D_swp;
int nThreads = threadsPerBlock.x * threadsPerBlock.y * threadsPerBlock.z * blocksPerGrid.x * blocksPerGrid.y * blocksPerGrid.z;
// Divide the list and give pieces of it to each thread, letting the pieces grow bigger and bigger until the whole list is sorted
for (int width = 2; width < (size << 1); width <<= 1) {
int slices = size / ((nThreads) * width) + 1;
// Actually call the kernel
gpu_mergesort<<<blocksPerGrid, threadsPerBlock>>>(A, B, size, width, slices, D_threads, D_blocks);
cudaDeviceSynchronize();
// Switch the input / output arrays instead of copying them around
A = A == D_data ? D_swp : D_data;
B = B == D_data ? D_swp : D_data;
}
// Get the list back from the GPU
HANDLE_ERROR(cudaMemcpy(data, A, size * sizeof(int), cudaMemcpyDeviceToHost));
// Free the GPU memory
HANDLE_ERROR(cudaFree(A));
HANDLE_ERROR(cudaFree(B));
}
/* Driver program to test above functions */
int main()
{
dim3 threadsPerBlock;
dim3 blocksPerGrid;
threadsPerBlock.x = 224;
blocksPerGrid.x = 10;
int i, *a;
printf("How many elements in the array? ");
a = (int *)malloc(sizeof(int) * N);
srand(time(0));
for(i=0;i<N;i++)
{
a[i]=rand()%1000;
}
printf("List Before Sorting...\n");
// printArray(a, N);
if (N<=THR2)
{
clock_t begin = clock();
smergeSort(a, 0, N - 1, THR2);
clock_t end = clock();
printf("\nSorted array: ");
//printArray(a,N);
printf("\n");
test(a,N);
double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
printf("SM");
printf("Elapsed: %f seconds\n",time_spent );
printf("\nSize of the array is %d",N);
exit(0);
}
else
{
clock_t begin = clock();
mergesort(a, N, threadsPerBlock, blocksPerGrid);
clock_t end = clock();
printf("\nSorted array: ");
//printArray(a,N);
printf("\n");
test(a,N);
double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
printf("Cuda");
printf("Elapsed: %f seconds\n",time_spent );
printf("\nSize of the array is %d\n",N);
exit(0);
}
}
现在程序工作正常,即使对于大型元素,但是,当我使用大量线程时,让我们说块10和线程224;给出错误: - 在mergesort.cu中遇到非法内存访问在第215行
调试代码后,我再次遇到错误: -
========= Invalid __global__ read of size 4
========= at 0x00000148 in
/home/sharmpra/mergesort.cu:150:gpu_mergesort(int*, int*, int, int, int, dim3*, dim3*)
========= by thread (96,0,0) in block (9,0,0)
========= Address 0x915fc0000 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib64/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x204235]
========= Host Frame:./a.out [0x1e831]
========= Host Frame:./a.out [0x3c3d3]
========= Host Frame:./a.out [0x38a8]
========= Host Frame:./a.out [0x37b1]
========= Host Frame:./a.out [0x3810]
========= Host Frame:./a.out [0x33d1]
========= Host Frame:./a.out [0x35ae]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf0) [0x20790]
========= Host Frame:./a.out [0x2bc9]
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaDeviceSynchronize.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib64/libcuda.so.1 [0x2ef503]
========= Host Frame:./a.out [0x3c0f6]
========= Host Frame:./a.out [0x33da]
========= Host Frame:./a.out [0x35ae]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf0) [0x20790]
========= Host Frame:./a.out [0x2bc9]
=========
请有人建议我在程序中可以执行哪些其他操作来删除此类错误。此外,我正在使用此命令行设置:-nvcc -o a.out -Wno-deprecated-gpu-targets -lineinfo -arch = compute_20,sm_20 -rdc = true -lcudadevrt mergesort.cu
答案 0 :(得分:1)
正如@Robert所解释的那样,代码是从全局内存中读取source [i]和source [j],每个都是int数量(大小为4个字节),所以我试图避免使用相同的数组进行比较我补充说: - for(int k = start; k&lt; end; k ++) dest [k] = source [k]; 在gpu_bottomUpMerge中,通过在我的代码中添加这一行,它可以处理更多的块和线程但仍然为大量元素提供非法内存错误,因此,为了解决该问题,我使用了指针而不是int,我使用了很长时间。 以下是该计划的更新版本:
/* C program for Merge Sort with Cuda Technology*/
#include<stdlib.h>
#include<stdio.h>
#include <cuda.h>
#include <sys/time.h>
#define THR1 1000
#define THR2 10000
#define N 800000
/*
********************************
Program UTILITY Code here
********************************
*/
static void HandleError( cudaError_t err, const char *file, int line ) {
if (err != cudaSuccess) {
printf( "%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
exit( EXIT_FAILURE );
}}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
/* Function to print an array */
void printArray(int A[], int size)
{
int i;
for (i=0; i < size; i++)
printf("%d ", A[i]);
printf("\n");
}
//Function to test if the output is in ascending order or not
void test(int a[], int n) {
int i;
for (i=1;i<n;++i) {
if (a[i]<a[i-1]) {
break;
}
}
if (i<n) {
for (i=1;i<n;++i) {
if (a[i]>a[i-1]){
break;
}
}
if (i<n) {
printf("\nArray is not sorted\n");
}
}
else {
printf("\nArray is sorted\n");
}
}
/*
*****************************************
Sequential Version here
*****************************************
*/
void insertionSort(int array[], int min, int max)
{
int key ;
// we loop through all elements in the original array from the min + 1 element
for (int j = min + 1 ; j <= max ; j++)
{
// store the current element as the key
key = array[j] ;
// get the element just before the current element
int i = j - 1 ;
// loop through all elements from the key to the min element
// check if the current element is smaller than the key
while (i >= min && array[i] > key)
{
// we move the current element backward
array[i+1] = array[i] ;
i-- ;
}
// we finally move the key
array[i+1] = key ;
}
}
void merge(int array[], int min, int max, int mid)
{
int firstIndex = min;
int secondIndex = mid + 1;
int * tempArray = new int [max + 1];
// While there are elements in the left or right runs
for (int index = min; index <= max; index++) {
// If left run head exists and is <= existing right run head.
if (firstIndex <= mid && (secondIndex > max || array[firstIndex] <= array[secondIndex]))
{
tempArray[index] = array[firstIndex];
firstIndex = firstIndex + 1;
}
else
{
tempArray[index] = array[secondIndex];
secondIndex = secondIndex + 1;
}
}
// transfer to the initial array
for (int index = min ; index <= max ; index++)
array[index] = tempArray[index];
}
void smergeSort(int array[], int min, int max, int threshold)
{
// prerequisite
if ( (max - min + 1) <= threshold )
{
insertionSort(array, min, max);
}
else
{
// get the middle point
int mid = (max+min) / 2;
// apply merge sort to both parts of this
smergeSort(array, min, mid, threshold);
smergeSort(array, mid+1, max, threshold);
// and finally merge all that sorted stuff
merge(array, min, max, mid) ;
}
}
/*
*****************************************
Parallel Version here
*****************************************
*/
__device__ void gpu_bottomUpMerge(int* source, int* dest, int start, int middle, int end) {
int i = start;
int j = middle;
for (int k = start; k < end; k++)
dest[k] = source[k];
for (int k = start; k < end; k++) {
if (i < middle && (j >= end || source[i] < dest[j])) {
dest[k] = source[i];
i++;
} else {
dest[k] = source[j];
j++;
}
}
}
__global__ void gpu_mergesort(int* source, int* dest, int size, int width, int slices, dim3* threads, dim3* blocks) {
int idx = blockDim .x * blockIdx .x + threadIdx .x;
int start = width*idx*slices,
middle,
end;
for (int slice = 0; slice < slices; slice++) {
if (start >= size)
break;
middle = min(start + (width >> 1), size);
end = min(start + width, size);
gpu_bottomUpMerge(source, dest, start, middle, end);
start += width;
}
}
void mergesort(int* data, int size, dim3 threadsPerBlock, dim3 blocksPerGrid) {
// Allocate two arrays on the GPU we switch back and forth between them during the sort
int* D_data;
int* D_swp;
dim3* D_threads;
dim3* D_blocks;
// Actually allocate the two arrays
HANDLE_ERROR(cudaMalloc((void**) &D_data, size * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**) &D_swp, size * sizeof(int)));
// Copy from our input list into the first array
HANDLE_ERROR(cudaMemcpy(D_data, data, size * sizeof(int), cudaMemcpyHostToDevice));
int* A = D_data;
int* B = D_swp;
int nThreads = threadsPerBlock.x * threadsPerBlock.y * threadsPerBlock.z * blocksPerGrid.x * blocksPerGrid.y * blocksPerGrid.z;
// Divide the list and give pieces of it to each thread, letting the pieces grow bigger and bigger until the whole list is sorted
for (int width = 2; width < (size << 1); width <<= 1) {
int slices = size / ((nThreads) * width) + 1;
// Actually call the kernel
gpu_mergesort<<<blocksPerGrid, threadsPerBlock>>>(A, B, size, width, slices, D_threads, D_blocks);
cudaDeviceSynchronize();
// Switch the input / output arrays instead of copying them around
A = A == D_data ? D_swp : D_data;
B = B == D_data ? D_swp : D_data;
}
// Get the list back from the GPU
HANDLE_ERROR(cudaMemcpy(data, A, size * sizeof(int), cudaMemcpyDeviceToHost));
// Free the GPU memory
HANDLE_ERROR(cudaFree(A));
HANDLE_ERROR(cudaFree(B));
}
/* Driver program to test above functions */
int main()
{
dim3 threadsPerBlock;
dim3 blocksPerGrid;
threadsPerBlock.x = 122;
blocksPerGrid.x = 1;
int i, *a;
printf("How many elements in the array? ");
a = (int *)malloc(sizeof(int) * N);
srand(time(0));
for(i=0;i<N;i++)
{
a[i]=rand()%1000;
}
printf("List Before Sorting...\n");
// printArray(a, N);
if (N<=THR2)
{
clock_t begin = clock();
smergeSort(a, 0, N - 1, THR2);
clock_t end = clock();
printf("\nSorted array: ");
//printArray(a,N);
printf("\n");
test(a,N);
double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
printf("SM");
printf("Elapsed: %f seconds\n",time_spent );
printf("\nSize of the array is %d",N);
exit(0);
}
else
{
clock_t begin = clock();
mergesort(a, N, threadsPerBlock, blocksPerGrid);
clock_t end = clock();
printf("\nSorted array: ");
//printArray(a,N);
printf("\n");
test(a,N);
double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
printf("Cuda");
printf("Elapsed: %f seconds\n",time_spent );
printf("\nSize of the array is %d\n",N);
exit(0);
}
}