cudaMemcpy之后数组的值不变,有什么想法吗?

时间:2016-06-18 23:27:46

标签: cuda

所以,我正在尝试从this演示文稿重新创建程序,但它只包含内核代码。我自己尝试编写“主要”代码,但似乎cudaMemcpy无效。这是我的代码:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <math.h>
#include <cstdlib>
#include <iostream>

#define N 1024
#define blockSize 256
#define numberOfBlocks N/(blockSize*2)

__global__ void reduce(int *g_idata, int *g_odata){
    extern __shared__ int sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*(blockDim.x*2) + tid;
    sdata[tid] = g_idata[i]*g_idata[i] + g_idata[i+blockDim.x]*g_idata[i+blockDim.x];
    __syncthreads();
    if (blockSize >= 512){
        if (tid < 256){
            sdata[tid] += sdata[tid + 256];
        }
        __syncthreads();
    }
    if (blockSize >= 256){
        if (tid < 128){
        sdata[tid] += sdata[tid + 128];
        }
        __syncthreads();
    }
    if (blockSize >= 128){
        if (tid < 64){
            sdata[tid] += sdata[tid + 64];
        }
        __syncthreads();
    }
    if (tid < 32) {
        if (blockSize >= 64) sdata[tid] += sdata[tid + 32];
        if (blockSize >= 32) sdata[tid] += sdata[tid + 16];
        if (blockSize >= 16) sdata[tid] += sdata[tid + 8];
        if (blockSize >= 8) sdata[tid] += sdata[tid + 4];
        if (blockSize >= 4) sdata[tid] += sdata[tid + 2];
        if (blockSize >= 2) sdata[tid] += sdata[tid + 1];
    }
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

int main()
{
    int A[N];
    for (int i = 0; i < N; i++){
        A[i] = 1;
    }
    int* devA;
    cudaMalloc((int **)&devA, sizeof(int) * N);
    cudaMemcpy(devA, &A, sizeof(int) * N, cudaMemcpyHostToDevice);
    int output[numberOfBlocks];
    for (int i = 0; i < numberOfBlocks; i++){
        output[i] = 8;
    }
    int* devOutput;
    cudaMalloc((int **)&devOutput, sizeof(int) * numberOfBlocks);
    cudaMemcpy(devOutput, &output, sizeof(int) * numberOfBlocks, cudaMemcpyHostToDevice);
    reduce<<<numberOfBlocks, blockSize>>>(devA, devOutput);
    cudaMemcpy(&output, devOutput, sizeof(int) * numberOfBlocks, cudaMemcpyDeviceToHost);
    int sum=0;
    for (int j = 0; j < numberOfBlocks; j++){
        sum+=output[j];
    }
    printf("output 1=%d\n",output[0]);
    printf("output 2=%d\n",output[1]);
    printf("sum=%d\n",sum);
    printf("sum=%f\n",sqrt(float(sum)));
    system("pause");
}

这就是我得到的:

output[0]=8
output[1]=8
sum=16
sqrt ofsum=4.0000

所以我觉得cudaMemcpy没有更新“输出”值,我不明白为什么。如果你有,请分享。

0 个答案:

没有答案