所以,我正在尝试从this演示文稿重新创建程序,但它只包含内核代码。我自己尝试编写“主要”代码,但似乎cudaMemcpy无效。这是我的代码:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <math.h>
#include <cstdlib>
#include <iostream>
#define N 1024
#define blockSize 256
#define numberOfBlocks N/(blockSize*2)
__global__ void reduce(int *g_idata, int *g_odata){
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(blockDim.x*2) + tid;
sdata[tid] = g_idata[i]*g_idata[i] + g_idata[i+blockDim.x]*g_idata[i+blockDim.x];
__syncthreads();
if (blockSize >= 512){
if (tid < 256){
sdata[tid] += sdata[tid + 256];
}
__syncthreads();
}
if (blockSize >= 256){
if (tid < 128){
sdata[tid] += sdata[tid + 128];
}
__syncthreads();
}
if (blockSize >= 128){
if (tid < 64){
sdata[tid] += sdata[tid + 64];
}
__syncthreads();
}
if (tid < 32) {
if (blockSize >= 64) sdata[tid] += sdata[tid + 32];
if (blockSize >= 32) sdata[tid] += sdata[tid + 16];
if (blockSize >= 16) sdata[tid] += sdata[tid + 8];
if (blockSize >= 8) sdata[tid] += sdata[tid + 4];
if (blockSize >= 4) sdata[tid] += sdata[tid + 2];
if (blockSize >= 2) sdata[tid] += sdata[tid + 1];
}
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
int main()
{
int A[N];
for (int i = 0; i < N; i++){
A[i] = 1;
}
int* devA;
cudaMalloc((int **)&devA, sizeof(int) * N);
cudaMemcpy(devA, &A, sizeof(int) * N, cudaMemcpyHostToDevice);
int output[numberOfBlocks];
for (int i = 0; i < numberOfBlocks; i++){
output[i] = 8;
}
int* devOutput;
cudaMalloc((int **)&devOutput, sizeof(int) * numberOfBlocks);
cudaMemcpy(devOutput, &output, sizeof(int) * numberOfBlocks, cudaMemcpyHostToDevice);
reduce<<<numberOfBlocks, blockSize>>>(devA, devOutput);
cudaMemcpy(&output, devOutput, sizeof(int) * numberOfBlocks, cudaMemcpyDeviceToHost);
int sum=0;
for (int j = 0; j < numberOfBlocks; j++){
sum+=output[j];
}
printf("output 1=%d\n",output[0]);
printf("output 2=%d\n",output[1]);
printf("sum=%d\n",sum);
printf("sum=%f\n",sqrt(float(sum)));
system("pause");
}
这就是我得到的:
output[0]=8
output[1]=8
sum=16
sqrt ofsum=4.0000
所以我觉得cudaMemcpy没有更新“输出”值,我不明白为什么。如果你有,请分享。