使用cuda thrust :: max_element查找数组中的max元素有时会返回错误

时间:2014-12-01 21:56:18

标签: c arrays cuda max thrust

我在设备上填充了一个2 ^ 20元素阵列;这些数字每次都应该是相同的。 然后我将该数组移到主机上然后搜索数组中的max元素,这种技术适用于2 ^ 10元素数组但是一旦我开始得到任何大于我开始得到随机答案我不确定推力是否搞乱或设备计算。

答案max_element应该返回0.094479通常在程序第一次运行时代码会输出正确答案然后答案将每隔几次随机显示

GPU是tesla k20运行5.0也在780GTX上测试过;两次同样的问题

//Host Code
int main( void ) {
    float h_c[TOTAL];
    float *d_c;

    cudaMalloc((void**)&d_c, sizeof(float)*TOTAL);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);

    //Number of threads
    kernel<<<BLOCKS,THREADS>>>(d_c);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float mil = 0;
    cudaEventElapsedTime(&mil, start, stop);

    cudaMemcpy(h_c, d_c, sizeof(float)*TOTAL, cudaMemcpyDeviceToHost);

    for(int y = 0; y < TOTAL; y++){
        printf(" %d: Host C: %f \n",y, h_c[y]);
    }
    float *result = thrust::max_element(h_c, h_c + TOTAL);
    printf("Max is: %f \n", *result);
    printf("Time:  %f \n", mil/1000);
    printf("THREADS:  %d \n", THREADS);
    printf("BLOCKS:  %d \n", BLOCKS);
    printf("TOTAL:  %d \n", TOTAL);
    cudaFree(d_c);
    cudaDeviceReset() ;
    return 0;
}

设备代码

#include <thrust/extrema.h>
#include <math.h>
#include <stdio.h>

#define ARRAYSIZE 15
#define THREADS 1024
#define BLOCKS 32
#define TOTAL THREADS * BLOCKS 

__global__ void kernel(float *cc){

//Get thread for summing all elements 
int threadId = threadIdx.x + blockDim.x * blockIdx.x;

int decimalNumber,quotient;
//Size of the array
//const int size = 10;
//Holds the binary number in an array
int binaryNumber[ARRAYSIZE];
int i = 0;


int a[ARRAYSIZE] = {1192, 1315, 1462, 1484, 1476, 1443, 1508, 1489, 1470, 1573, 1633, 1539, 1600, 1707, 1701};//, 1682, 1688, 1681, 1694, 1728};
int b[ARRAYSIZE] = {1162, 1337, 1282, 1491, 1508, 1517, 1488, 1513, 1539, 1576, 1626 ,1634, 1573,    1786, 1741};//, 1782, 1755, 1669, 1700, 1826};

//Holds Product from the dot product
int c[ARRAYSIZE];
//Arrays to hold integers to be summed 
int aSumArr[ARRAYSIZE];
int bSumArr[ARRAYSIZE];

for(int i = 0; i < ARRAYSIZE; i++){
    c[i] = 0;
    aSumArr[i] = 0;
    bSumArr[i] = 0;
}

//Holds the value for the dot product
int dotSum = 0;
//Holds sum of valid array positions for array a
int aSum = 0;
//Holds sum of valid array positions for array b
int bSum = 0;

//Holds the Value of the arcCos of the dot product / sqrt(array a) * sqrt(array b)
float finalValue = 0;
//printf("ThreadID: %d \n", threadId);
//ALL 1's 1048575 = Threads
decimalNumber = threadId;
//printf("decimal number:  %d \n", decimalNumber); 

quotient = decimalNumber;
//Loop to convert decimal into binary and store in array
while(quotient!=0){

    binaryNumber[i++]= quotient % 2;

    quotient = quotient / 2;

}

//Test if conversion from decimal to binary is complete and correct
//printf("Equivalent binary value of decimal number %d: \n",decimalNumber);

//for(int in = size-1; in >= 0;in--){
  //printf("Index: %d | binary number:  %d ----  a:%d || b: %d\n",in,binaryNumber[in],a[in],b[in]);
//}
//printf(" \n ");

//Loop through binaryNumber array
for(int x = ARRAYSIZE-1 ; x >= 0; x--){
    //If index is == 1 Perform calculation
    if(binaryNumber[x] == 1){
        //Multiply numbers at good index
        c[x] = a[x] * b[x];
        //Fill sum arrays at correct index
        aSumArr[x] = a[x];
        bSumArr[x] = b[x];

        //Checks if the loop is executing correctly
        //sumArray[x] = 1;
        //printf("Multiplied - %d * %d = %f\n", a[x], b[x], c[x]);
        //printf("--This should not be run --\n");
    }else{
//          printf("SKIPPED - %d * %d = %f\n", a[x], b[x], c[x]);
    }


}

//Sums up the product array to complete dot product
for(int j = 0; j < ARRAYSIZE; ++j){
    dotSum += c[j];
    //printf("aSumArr %d \n", aSumArr[j]);
    //printf("bSumArr %d \n", bSumArr[j]);
    aSum += powf( aSumArr[j], 2 );
    bSum += powf( bSumArr[j], 2 );
//      printf("aSum: %d +  aSumArr %d \n", aSum, aSumArr[j]);
//      printf("bSum: %d +  bSumArr %d \n", bSum, bSumArr[j]);
}


//printf("\n");
//Print out the dot prudct
//printf("Dot product is: %d \n", dotSum);
//printf("aSum is: %d \n", aSum);
//printf("bSum is: %d \n", bSum);

float sqSum1 = sqrtf(aSum);
float sqSum2 = sqrtf(bSum);
// printf("sqSum1: %f \n", sqSum1);
// printf("sqSum2: %f \n", sqSum2);
float sqSum = sqSum1 * sqSum2;
// printf("sqSum %f \n", sqSum);     
float div = dotSum / sqSum ;
// printf("div: %f \n", div);
finalValue = acosf( div ) ;

//Stores the threads final value in array cc, in the respected index
if(finalValue == finalValue){
    cc[threadId] = finalValue;
}else{
    cc[threadId] = -2;
}
//printf("final value is: %f for number %d \n", finalValue, threadId);
}

1 个答案:

答案 0 :(得分:1)

似乎是使用不正确的初始化/未初始化变量的情况。

我添加以下行后:

for(int i = 0; i < ARRAYSIZE; i++){
    c[i] = 0;
    aSumArr[i] = 0;
    bSumArr[i] = 0;
    binaryNumber[i] = 0; // add this line
}

我无法重现这个问题。