OpenAcc Merge排序程序

时间:2017-04-19 19:15:02

标签: c openacc

#include<stdlib.h>
#include<stdio.h>
#include <time.h>
#include <omp.h>
#include <openacc.h>
#define THR 10


//Function to test if the output is in asending order or not
void test(int a[], int n) {
int i;
for (i=1;i<n;++i) {
if (a[i]<a[i-1]) {
  break;
}
}
if (i<n) { 
for (i=1;i<n;++i) {  
   if (a[i]>a[i-1]){
      break;
   }
 }
if (i<n) {
  printf("\nArray is not sorted\n");
}
}
else {
printf("\nArray is sorted\n");
}
}

 /* Function to sort an array using insertion sort in serial*/
void isort (int *array, int low, int mid, int high) {

for (int i = mid; i <= high; i++) {
    for (int j = i - 1; j >= 0; j--) {
        if (array[i] < array [j]) {
            int holder = array[j];
            array[j] = array[i];
            array[i] = holder;
            i--;
        }
    }
 }
 }
/* Function to merge */
void merge(int arr[], int l, int m, int r);

// Utility function to find minimum of two integers
#pragma acc routine seq
int min(int x, int y) { return (x<y)? x :y; }

/* Iterative mergesort function to sort arr[0...n-1] */
void mergeSort(int arr[], int n)
{
int curr_size;  // For current size of subarrays to be merged
               // curr_size varies from 1 to n/2
int left_start; // For picking starting index of left subarray
               // to be merged
#pragma acc kernels //pcopy(arr[0:n1])// pcopying (R[0:n2])
      { 
 #pragma acc loop independent
  for (curr_size=1; curr_size<=n-1; curr_size = 2*curr_size)
 {
  #pragma acc loop independent
   // Pick starting point of different subarrays of current size
   for (left_start=0; left_start<n-1; left_start += 2*curr_size)
   {
       // Find ending point of left subarray. mid+1 is starting 
       // point of right
       int mid = left_start + curr_size - 1;

       int right_end = min(left_start + 2*curr_size - 1, n-1);

       // Merge Subarrays arr[left_start...mid] & arr[mid+1...right_end]
       merge(arr, left_start, mid, right_end);
   }
 }
}}

/* Function to merge the two haves arr[l..m] and arr[m+1..r] of array arr[] 
*/
 #pragma acc routine vector
 void merge(int arr[], int l, int m, int r)
{
int i, j, k;
int n1 = m - l + 1;
int n2 =  r - m;

/* create temp arrays */
int *L, *R;
 L = (int *)malloc(sizeof(int) * n1); 
 R = (int *)malloc(sizeof(int) * n2);     
/* Copy data to temp arrays L[] and R[] */
 #pragma acc loop independent 
for (i = 0; i < n1; i++)
    L[i] = arr[l + i];
 #pragma acc loop independent 
for (j = 0; j < n2; j++)
    R[j] = arr[m + 1+ j];

/* Merge the temp arrays back into arr[l..r]*/
i = 0;
j = 0;
k = l;

#pragma acc loop independent private(k) reduction(+:j) private (L[0:n1]) private (R[0:n1]) private (arr[0:n1])reduction(+:i)
      //  private(k) reduction(+:j)
while (i < n1 && j < n2)
{
    if (L[i] <= R[j])
    {
        arr[k] = L[i];
        i++;
    }
    else
    {
        arr[k] = R[j];
        j++;
    }
    k++;
}

/* Copy the remaining elements of L[], if there are any */

#pragma acc loop seq private (arr[0:n2]) 
while (i < n1)
{
    arr[k] = L[i];
    i++;
    k++;
}

/* Copy the remaining elements of R[], if there are any */
#pragma acc loop seq private (arr[0:n2]) 
while (j < n2)
{
    arr[k] = R[j];
    j++;
    k++;
}
free(L);
free(R);
}


/* Function to print an array */
void printArray(int A[], int size)
{
int i;
for (i=0; i < size; i++)
    printf("%d ", A[i]);
printf("\n");
}

/* Driver program to test above functions */
int main()
 {
 int i, n=20, *a;
 double startTime, endTime;

 printf("How many elements in the array? ");

 a = (int *)malloc(sizeof(int) * n);        
 srand(time(0));
 for(i=0;i<n;i++)
       {
         a[i]=rand()%1000;
       }    
 printf("List Before Sorting...\n");
 printArray(a, n);
 if (n<=THR) 
 {
 startTime = omp_get_wtime();
 isort(a,0,0,n); 
 endTime = omp_get_wtime();
 printf("\nSorted array:  ");
 printArray(a,n);
 printf("\n");
 test(a,n);
 printf("IN");
 printf("\nTime: %g\n",endTime-startTime);
 exit(0);
 }       

 else 
 {
 startTime = omp_get_wtime();
 mergeSort(a,n); 
 endTime = omp_get_wtime();
 printf("\nSorted array:  ");
 printArray(a,n);
 printf("\n");
 test(a,n);
 printf("ACC");
 printf("\nTime: %g\n",endTime-startTime);
 printf("\nSize of the array is %d",n);

 exit(0);
 }      
 }

在阅读PGI论坛上的大量文章和来自stackoverflow的@MAT帮助后,我设法解决了大部分错误,现在我使用迭代Merge Sort而不是递归,因为我发现OpenAcc不能很好地处理递归功能。现在,我只有一个错误,这是在合并功能: -

109,没有整数跳闸计数的循环将以顺序模式执行

我也阅读http://www.pgroup.com/userforum/viewtopic.php?p=17748&sid=0aa1537bdc68fc1f6fac74f66a788970,但无法确定在我的程序中使用它。由于这个错误,我的结果是: -

数组中有多少个元素?排序前列表... 801 673 288 374 516 908 473 130 874 928 491 406 276 302 186 442 865 341 624 725 浮点异常

即使我在程序中使用整数,它也会说浮点异常。

1 个答案:

答案 0 :(得分:0)

这里有一些问题。

看起来可能存在&#34; mid&#34;更大的是&#34; right_end&#34;这将导致malloc具有负大小。虽然我不确定它是否是正确的修复方法,但我在中间&gt; right_end时添加了一条if语句来跳过合并。

并行循环必须是可数的,即迭代次数在循环开始时是已知的。因此,你不能把#&#34;循环&#34; while循环的指令。

推杆&#34; arr&#34;私人条款并不是你想要的。您想要更新全局数组,而不是私有副本。 Private将为每个gang,vector或worker创建变量的私有副本(取决于具有private子句的循环的调度)。内核执行后,私有变量消失了。

你的&#34; curr_size&#34; for循环具有依赖性,因此无法并行化。这里的问题是它大步走过&#34; curr_size = 2 * curr_size&#34;。每次迭代都需要先前迭代的curr_size值才能计算出自己的curr_size。

请注意使用&#34; malloc&#34;在计算区域内是有问题的。首先,设备端mallocs非常慢,因此会对性能产生负面影响。此外,设备上的堆非常小(默认为8MB但可以增加但设置环境变量PGI_ACC_HEAPSIZE)因此非常大的n值可能会溢出堆并产生非法地址错误。

这里是固定代码:

% cat test.c
#include<stdlib.h>
#include<stdio.h>
#include <time.h>
#include <omp.h>
#include <openacc.h>
#define THR 10


//Function to test if the output is in asending order or not
void test(int a[], int n) {
int i;
for (i=1;i<n;++i) {
if (a[i]<a[i-1]) {
  break;
}
}
if (i<n) {
for (i=1;i<n;++i) {
   if (a[i]>a[i-1]){
      break;
   }
 }
if (i<n) {
  printf("\nArray is not sorted\n");
}
}
else {
printf("\nArray is sorted\n");
}
}

 /* Function to sort an array using insertion sort in serial*/
void isort (int *array, int low, int mid, int high) {

for (int i = mid; i <= high; i++) {
    for (int j = i - 1; j >= 0; j--) {
        if (array[i] < array [j]) {
            int holder = array[j];
            array[j] = array[i];
            array[i] = holder;
            i--;
        }
    }
 }
 }
/* Function to merge */
void merge(int arr[], int l, int m, int r);

// Utility function to find minimum of two integers
#pragma acc routine seq
int min(int x, int y) { return (x<y)? x :y; }

/* Iterative mergesort function to sort arr[0...n-1] */
void mergeSort(int arr[], int n)
{
int curr_size;  // For current size of subarrays to be merged
               // curr_size varies from 1 to n/2
int left_start; // For picking starting index of left subarray
               // to be merged
#pragma acc data copy(arr[0:n])// pcopying (R[0:n2])
      {
  for (curr_size=1; curr_size<=n-1; curr_size = 2*curr_size)
 {
  #pragma acc parallel loop
   // Pick starting point of different subarrays of current size
   for (left_start=0; left_start<n-1; left_start += 2*curr_size)
   {
       // Find ending point of left subarray. mid+1 is starting
       // point of right
       int mid = left_start + curr_size - 1;

       int right_end = min(left_start + 2*curr_size - 1, n-1);

       // Merge Subarrays arr[left_start...mid] & arr[mid+1...right_end]
       if (mid < right_end) merge(arr, left_start, mid, right_end);
   }
 }
}}

/* Function to merge the two haves arr[l..m] and arr[m+1..r] of array arr[]
*/
 #pragma acc routine(merge) vector
 void merge(int arr[], int l, int m, int r)
{
int i, j, k;
int n1 = m - l + 1;
int n2 =  r - m;

/* create temp arrays */
int *L, *R;
 L = (int *)malloc(sizeof(int) * n1);
 R = (int *)malloc(sizeof(int) * n2);
/* Copy data to temp arrays L[] and R[] */
 #pragma acc loop independent
for (i = 0; i < n1; i++)
    L[i] = arr[l + i];
 #pragma acc loop independent
for (j = 0; j < n2; j++)
    R[j] = arr[m + 1+ j];

/* Merge the temp arrays back into arr[l..r]*/
i = 0;
j = 0;
k = l;

while (i < n1 && j < n2)
{
    if (L[i] <= R[j])
    {
        arr[k] = L[i];
        i++;
    }
    else
    {
        arr[k] = R[j];
        j++;
    }
    k++;
}

/* Copy the remaining elements of L[], if there are any */

while (i < n1)
{
    arr[k] = L[i];
    i++;
    k++;
}

/* Copy the remaining elements of R[], if there are any */
while (j < n2)
{
    arr[k] = R[j];
    j++;
    k++;
}
free(L);
free(R);
}


/* Function to print an array */
void printArray(int A[], int size)
{
int i;
for (i=0; i < size; i++)
    printf("%d ", A[i]);
printf("\n");
}

/* Driver program to test above functions */
int main()
 {
 int i, n=100, *a;
 double startTime, endTime;

 printf("How many elements in the array? ");

 a = (int *)malloc(sizeof(int) * n);
 srand(time(0));
 for(i=0;i<n;i++)
       {
         a[i]=rand()%1000;
       }
 printf("List Before Sorting...\n");
 printArray(a, n);
 if (n<=THR)
 {
 startTime = omp_get_wtime();
 isort(a,0,0,n);
 endTime = omp_get_wtime();
 printf("\nSorted array:  ");
 printArray(a,n);
 printf("\n");
 test(a,n);
 printf("IN");
 printf("\nTime: %g\n",endTime-startTime);
 exit(0);
 }

 else
 {
 startTime = omp_get_wtime();
 mergeSort(a,n);
 endTime = omp_get_wtime();
 printf("\nSorted array:  ");
 printArray(a,n);
 printf("\n");
 test(a,n);
 printf("ACC");
 printf("\nTime: %g\n",endTime-startTime);
 printf("\nSize of the array is %d\n",n);

 exit(0);
 }
 }

% pgcc -Minfo=accel -acc test.c -fast -o gpu
min:
     51, Generating acc routine seq
         Generating Tesla code
mergeSort:
     60, Generating copy(arr[:n])
     64, Accelerator kernel generated
         Generating Tesla code
         66, #pragma acc loop gang /* blockIdx.x */
merge:
     84, Generating Tesla code
         95, #pragma acc loop vector /* threadIdx.x */
         98, #pragma acc loop vector /* threadIdx.x */
        106, #pragma acc loop seq
        123, #pragma acc loop seq
        131, #pragma acc loop seq
     95, Loop is parallelizable
     98, Loop is parallelizable
    106, Loop carried scalar dependence for i at line 119,126,125
         Loop carried scalar dependence for j at line 106,108
         Loop carried scalar dependence for i at line 111,110
         Loop carried scalar dependence for j at line 116
         Scalar last value needed after loop for i at line 123
         Loop carried scalar dependence for j at line 134,133
         Loop carried scalar dependence for i at line 108
         Loop carried scalar dependence for j at line 115
    110, Accelerator restriction: induction variable live-out from loop: k
    115, Accelerator restriction: induction variable live-out from loop: k
    118, Accelerator restriction: induction variable live-out from loop: k
% ./gpu
How many elements in the array? List Before Sorting...
58 612 99 182 914 802 452 229 856 644 932 339 650 915 533 300 456 892 860 764 866 229 400 561 328 170 573 121 364 392 801 775 356 901 309 622 703 762 851 911 406 135 602 56 51 136 708 507 380 568 623 246 797 24 160 125 194 733 599 910 477 400 685 185 653 995 808 709 109 659 972 867 147 575 276 198 711 984 57 91 553 680 689 702 56 849 828 602 935 779 865 412 179 550 598 185 897 758 894 6

Sorted array:  6 24 51 56 56 57 58 91 99 109 121 125 135 136 147 160 170 179 182 185 185 194 198 229 229 246 276 300 309 328 339 356 364 380 392 400 400 406 412 452 456 477 507 533 550 553 561 568 573 575 598 599 602 602 612 622 623 644 650 653 659 680 685 689 702 703 708 709 711 733 758 762 764 775 779 797 801 802 808 828 849 851 856 860 865 866 867 892 894 897 901 910 911 914 915 932 935 972 984 995


Array is sorted
ACC
Time: 1.26239

Size of the array is 100