我在设备上填充了一个包含 2^20 个元素的数组;每次运行得到的数值都应该相同。然后我把该数组拷贝到主机,并在其中查找最大元素。这种方法对 2^10 个元素的数组有效,但一旦数组更大,我就开始得到随机的结果。我不确定问题出在 Thrust 上,还是出在设备端的计算上。
GPU 是 Tesla K20,运行 CUDA 5.0;也在 GTX 780 上测试过,两次都出现同样的问题。
//Host Code
// Host entry point: allocates a device buffer of TOTAL floats, copies it back
// into h_c, prints the elements, and reports the maximum located with
// thrust::max_element together with timing and launch-configuration values.
// NOTE(review): this listing appears to have lost lines — no closing braces,
// no kernel launch and no cudaEventCreate/cudaEventRecord calls are visible.
// Confirm against the complete file before relying on these comments.
// NOTE(review): TOTAL is not defined in the visible part of the file.
int main( void ) {
// Host-side destination for the device results. This is a stack array, so a
// large TOTAL requires a correspondingly large stack.
float h_c[TOTAL];
// Device buffer holding one float per element.
float *d_c;
cudaMalloc((void**)&d_c, sizeof(float)*TOTAL);
// Events passed to cudaEventElapsedTime below.
// NOTE(review): no creation or recording of these events is visible here.
cudaEvent_t start, stop;
//Number of threads
// Receives the elapsed time between start and stop (the API reports milliseconds).
float mil = 0;
cudaEventElapsedTime(&mil, start, stop);
// Blocking copy of the whole device buffer into h_c.
cudaMemcpy(h_c, d_c, sizeof(float)*TOTAL, cudaMemcpyDeviceToHost);
// Print every element copied back (the loop's closing brace is not visible in this listing).
for(int y = 0; y < TOTAL; y++){
printf(" %d: Host C: %f \n",y, h_c[y]);
// Locate the largest element of the host array; result points into h_c.
float *result = thrust::max_element(h_c, h_c + TOTAL);
printf("Max is: %f \n", *result);
// mil is in milliseconds, so this prints seconds.
printf("Time: %f \n", mil/1000);
printf("THREADS: %d \n", THREADS);
printf("BLOCKS: %d \n", BLOCKS);
printf("TOTAL: %d \n", TOTAL);
cudaDeviceReset() ;
return 0;
#include <thrust/extrema.h>
#include <math.h>
#include <stdio.h>
#define ARRAYSIZE 15
#define THREADS 1024
#define BLOCKS 32
// kernel: each thread treats its global index as a bit pattern selecting a
// subset of the positions of the fixed vectors a and b. For the selected
// positions it accumulates the dot product and the squared norms, then stores
// acosf(dot / (|a| * |b|)) in cc[threadId]; the self-comparison on finalValue
// is a NaN test guarding that store.
//
// Parameters:
//   cc - output array indexed by threadId; must hold at least one float per
//        launched thread (no bounds check is performed here).
//
// Fix: binaryNumber is now zeroed together with the other scratch arrays.
// The conversion code only writes the low-order digits it produces, while the
// selection loop reads all ARRAYSIZE entries, so the untouched entries used to
// be read uninitialized and produced results that changed from run to run.
//
// NOTE(review): this listing appears to have lost lines (closing braces, the
// declaration of c, the header of the binary-conversion loop and the else
// branches are not visible). Those gaps are left untouched; confirm against
// the complete file.
__global__ void kernel(float *cc){
//Get thread for summing all elements
int threadId = threadIdx.x + blockDim.x * blockIdx.x;
int decimalNumber,quotient;
//Size of the array
//const int size = 10;
//Holds the binary number in an array
int binaryNumber[ARRAYSIZE];
// Write cursor into binaryNumber used by the conversion below.
int i = 0;
int a[ARRAYSIZE] = {1192, 1315, 1462, 1484, 1476, 1443, 1508, 1489, 1470, 1573, 1633, 1539, 1600, 1707, 1701};//, 1682, 1688, 1681, 1694, 1728};
int b[ARRAYSIZE] = {1162, 1337, 1282, 1491, 1508, 1517, 1488, 1513, 1539, 1576, 1626 ,1634, 1573, 1786, 1741};//, 1782, 1755, 1669, 1700, 1826};
//Holds Product from the dot product
//Arrays to hold integers to be summed
int aSumArr[ARRAYSIZE];
int bSumArr[ARRAYSIZE];
// Clear every per-thread scratch array before use. The loop index shadows
// the outer i on purpose; the outer cursor stays at 0 for the conversion.
for(int i = 0; i < ARRAYSIZE; i++){
c[i] = 0;
aSumArr[i] = 0;
bSumArr[i] = 0;
// Digits the conversion never writes must read as 0, not as garbage.
binaryNumber[i] = 0;
//Holds the value for the dot product
int dotSum = 0;
//Holds sum of valid array positions for array a
int aSum = 0;
//Holds sum of valid array positions for array b
int bSum = 0;
//Holds the Value of the arcCos of the dot product / sqrt(array a) * sqrt(array b)
float finalValue = 0;
//printf("ThreadID: %d \n", threadId);
//ALL 1's 1048575 = Threads
decimalNumber = threadId;
//printf("decimal number: %d \n", decimalNumber);
quotient = decimalNumber;
//Loop to convert decimal into binary and store in array
binaryNumber[i++]= quotient % 2;
quotient = quotient / 2;
//Test if conversion from decimal to binary is complete and correct
//printf("Equivalent binary value of decimal number %d: \n",decimalNumber);
//for(int in = size-1; in >= 0;in--){
//printf("Index: %d | binary number: %d ---- a:%d || b: %d\n",in,binaryNumber[in],a[in],b[in]);
//printf(" \n ");
//Loop through binaryNumber array
for(int x = ARRAYSIZE-1 ; x >= 0; x--){
//If index is == 1 Perform calculation
if(binaryNumber[x] == 1){
//Multiply numbers at good index
c[x] = a[x] * b[x];
//Fill sum arrays at correct index
aSumArr[x] = a[x];
bSumArr[x] = b[x];
//Checks if the loop is executing correctly
//sumArray[x] = 1;
//printf("Multiplied - %d * %d = %f\n", a[x], b[x], c[x]);
//printf("--This should not be run --\n");
// printf("SKIPPED - %d * %d = %f\n", a[x], b[x], c[x]);
//Sums up the product array to complete dot product
for(int j = 0; j < ARRAYSIZE; ++j){
dotSum += c[j];
//printf("aSumArr %d \n", aSumArr[j]);
//printf("bSumArr %d \n", bSumArr[j]);
aSum += powf( aSumArr[j], 2 );
bSum += powf( bSumArr[j], 2 );
// printf("aSum: %d + aSumArr %d \n", aSum, aSumArr[j]);
// printf("bSum: %d + bSumArr %d \n", bSum, bSumArr[j]);
//Print out the dot prudct
//printf("Dot product is: %d \n", dotSum);
//printf("aSum is: %d \n", aSum);
//printf("bSum is: %d \n", bSum);
float sqSum1 = sqrtf(aSum);
float sqSum2 = sqrtf(bSum);
// printf("sqSum1: %f \n", sqSum1);
// printf("sqSum2: %f \n", sqSum2);
float sqSum = sqSum1 * sqSum2;
// printf("sqSum %f \n", sqSum);
float div = dotSum / sqSum ;
// printf("div: %f \n", div);
finalValue = acosf( div ) ;
//Stores the threads final value in array cc, in the respected index
// finalValue == finalValue is false only for NaN.
// NOTE(review): the -2 store presumably belongs to an else branch that is
// not visible in this listing — confirm.
if(finalValue == finalValue){
cc[threadId] = finalValue;
cc[threadId] = -2;
//printf("final value is: %f for number %d \n", finalValue, threadId);
答案 0 :(得分:1)
// Suggested fix from the answer: zero every per-thread scratch array before
// use, now including binaryNumber.
for(int i = 0; i < ARRAYSIZE; i++){
c[i] = 0;
aSumArr[i] = 0;
bSumArr[i] = 0;
binaryNumber[i] = 0; // added: clear binaryNumber too, so no entry is read before it has been written