CUDA:使用具有共享内存的全局线程索引将不起作用

时间:2014-07-07 17:42:33

标签: c++ pointers memory cuda shared

有人可以解释为什么当我的共享内存指针数组TMS在第0个索引以外的某个索引(在最后一行中发生)访问时,我的内核无法工作吗?如果在最后一行使用TMS [0],则一切都按预期工作。当我将TMS [0]更改为任何其他索引时,我收到CUDA意外错误。假设一个块上有64个线程。

#include <stdio.h>
#include <stdlib.h>
#include "cuda.h"
#include <cuda_runtime.h>
#include <curand_kernel.h>

__global__ void myKern(float *masterForces)
{
    int globalIdx = ...// set global thread id

    volatile __shared__ float uniques[64];

    {
        uniques[globalIDx] = 0;
    }

    __syncthreads();


    volatile __shared__ float *TMS[64]; 

    {
       TMS[globalIdx] = (&uniques[globalIdx]);
    }

    __syncthreads();

    masterForces[globalIdx] = *TMS[1];
}

原创背景,如果您有点好奇:(您真的不需要看这个来解决我的问题)

#include <stdio.h>
#include <stdlib.h>
#include "cuda.h"
#include "curand.h"
#include <cuda_runtime.h>
#include "math.h"
#include <curand_kernel.h>
#include <time.h>


__global__ void myKern(const float *transMatrix, const int *pointerMatrix, float *masterForces, const double *rands, const int r_max)
{




int globalIdx = ((blockIdx.x + (blockIdx.y * gridDim.x)) * (blockDim.x * blockDim.y)) + (threadIdx.x + (threadIdx.y * blockDim.x));

volatile __shared__ float uniques[51];

uniques[0] = transMatrix[0]; uniques[1] = transMatrix[1]; uniques[2] = transMatrix[2]; // 1
uniques[3] = transMatrix[3]; uniques[4] = transMatrix[4]; uniques[5] = transMatrix[12]; // 2
uniques[6] = transMatrix[14]; uniques[7] = transMatrix[15]; uniques[8] = transMatrix[24]; // 3
uniques[9] = transMatrix[26]; uniques[10] = transMatrix[27]; uniques[11] = transMatrix[28]; // 4
uniques[12] = transMatrix[40]; uniques[13] = transMatrix[50]; uniques[14] = transMatrix[60]; // 5
uniques[15] = transMatrix[62]; uniques[16] = transMatrix[146]; uniques[17] = transMatrix[156]; // 6
uniques[18] = transMatrix[158]; uniques[19] = transMatrix[168]; uniques[20] = transMatrix[170]; // 7
uniques[21] = transMatrix[172]; uniques[22] = transMatrix[184]; uniques[23] = transMatrix[290]; // 8
uniques[24] = transMatrix[300]; uniques[25] = transMatrix[302]; uniques[26] = transMatrix[312]; // 9
uniques[27] = transMatrix[314]; uniques[28] = transMatrix[316]; uniques[29] = transMatrix[328]; // 10
uniques[30] = transMatrix[1010]; uniques[31] = transMatrix[1020]; uniques[32] = transMatrix[1022]; // 11
uniques[33] = transMatrix[1032]; uniques[34] = transMatrix[1034]; uniques[35] = transMatrix[1036]; // 12
uniques[36] = transMatrix[1048]; uniques[37] = transMatrix[1154]; uniques[38] = transMatrix[1164]; // 13
uniques[39] = transMatrix[1166]; uniques[40] = transMatrix[1176]; uniques[41] = transMatrix[1178]; // 14
uniques[42] = transMatrix[1180]; uniques[43] = transMatrix[1192]; uniques[44] = transMatrix[2018]; // 15
uniques[45] = transMatrix[2028]; uniques[46] = transMatrix[2030]; uniques[47] = transMatrix[2040]; // 16
uniques[48] = transMatrix[2042]; uniques[49] = transMatrix[2044]; uniques[50] = transMatrix[2056]; // 17

__syncthreads();


volatile __shared__ float *TMS[2592]; 

  for (int t=0; t<2592; t++)    
  {
    for (int m=0; m< 51; m++){
       if (pointerMatrix[t] == m)
       {
        TMS[t] = (&uniques[m]);
       }
  }
__syncthreads();


int b0 = 0;
int c0 = 0;
int d0 = 0;
int e0 = 0;
int f0 = 0;
int g0 = 0;
int h0 = 0;
int i0 = 0;
int j0 = 0;
int k0 = 0;
int l0 = 0;
int m0 = 0;
int n0 = 0;
int o0 = 0;
int p0 = 0;
int q0 = 0;
int r0 = 0;
int s0 = 0;
int t0 = 0;
int u0 = 0;
int v0 = 0;
int w0 = 0;
int x0 = 0;
int y0 = 0;




int index = 0;
float r = 0.0;
float temp = 0;

int RUsnapshot = 0; 
int leftsnap = 0;

curandState s;
curand_init (rands[globalIdx] , 0, 0, &s);



for (int i =0; i < 160000; i++) //@@@depends on iterations @@@@@
{
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = b0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((0 * 6 + c0) * 6  + b0) * 2) * 6) ;

            b0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = b0;
        ///////////////////////////////////////////////////////////    
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = c0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + d0) * 6  + c0) * 2) * 6) ;

            c0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = c0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = d0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + e0) * 6  + d0) * 2) * 6) ;

            d0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = d0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = e0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + f0) * 6  + e0) * 2) * 6) ;

            e0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = e0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = f0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + g0) * 6  + f0) * 2) * 6) ;

            f0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = f0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = g0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + h0) * 6  + g0) * 2) * 6) ;

            g0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = g0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = h0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + i0) * 6  + h0) * 2) * 6) ;

            h0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = h0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = i0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + j0) * 6  + i0) * 2) * 6) ;

            i0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = i0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = j0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + k0) * 6  + j0) * 2) * 6) ;

            j0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = j0;
        ///////////////////////////////////////////////////////////    
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = k0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + l0) * 6  + k0) * 2) * 6) ;

            k0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = k0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = l0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + m0) * 6  + l0) * 2) * 6) ;

            l0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = l0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = m0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + n0) * 6  + m0) * 2) * 6) ;

            m0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = m0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = n0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + o0) * 6  + n0) * 2) * 6) ;

            n0 += ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = n0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = o0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + p0) * 6  + o0) * 2) * 6) ;

            o0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = o0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = p0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + q0) * 6  + p0) * 2) * 6) ;

            p0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = p0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = q0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + r0) * 6  + q0) * 2) * 6) ;

            q0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = q0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = r0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + s0) * 6  + r0) * 2) * 6) ;

            r0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = r0;
        ///////////////////////////////////////////////////////////    
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = s0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + t0) * 6  + s0) * 2) * 6) ;

            s0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = s0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = t0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + u0) * 6  + t0) * 2) * 6) ;

            t0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = t0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot =u0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + v0) * 6  + u0) * 2) * 6) ;

            u0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap =u0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = v0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + w0) * 6  + v0) * 2) * 6) ;

            v0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = v0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = w0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + x0) * 6  + w0) * 2) * 6) ;

            w0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = w0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = x0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + y0) * 6  + x0) * 2) * 6) ;

            x0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = x0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = y0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + 0) * 6  + y0) * 2) * 6) ;

            y0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;


        ///////////////////////////////////////////////////////////   






            temp = (b0 ==4) + (b0 ==5) + (c0 ==4) + (c0 ==5) + (d0 ==4) + (d0 ==5) + (e0 ==4) + (e0 ==5) + (f0 ==4) + (f0 ==5) + 
                   (g0 ==4) + (g0 ==5) + (h0 ==4) + (h0 ==5) + (i0 ==4) + (i0 ==5) + (j0 ==4) + (j0 ==5) + (k0 ==4) + (k0 ==5) + 
                   (l0 ==4) + (l0 ==5) + (m0 ==4) + (m0 ==5) + (n0 ==4) + (n0 ==5) + (o0 ==4) + (o0 ==5) + (p0 ==4) + (p0 ==5) + 
                   (q0 ==4) + (q0 ==5) + (r0 ==4) + (r0 ==5) + (s0 ==4) + (s0 ==5) + (t0 ==4) + (t0 ==5) + (u0 ==4) + (u0 ==5) + 
                   (v0 ==4) + (v0 ==5) + (w0 ==4) + (w0 ==5) + (x0 ==4) + (x0 ==5) + (y0 ==4) + (y0 ==5);


        masterForces[globalIdx + (r_max * i)] = *TMS[1]; 
        temp = 0.0;

}



}

}

2 个答案:

答案 0 :(得分:4)

假设您的全局线程索引(globalIdx)实际上是一个全局线程索引,例如:

int globalIdx = threadIdx.x+blockDim.x*blockIdx.x; // e.g. for 1D grid/threadblock

然后,当全局线程索引超过63时,您无法索引到64条目共享内存数组:

    uniques[globalIDx] = 0;

   TMS[globalIdx] = (&uniques[globalIdx]);

相反,如果仅使用线程索引:

    uniques[threadIdx.x] = 0;

和你的代码中的其他地方类似,你应该可以,至少为索引到共享内存数组,假设每个块有64个线程。

这是一个完全有效的例子:

$ cat t463.cu
#include <stdio.h>
#define DSIZE 128
#define nTPB 64
__global__ void myKern(float *masterForces)
{
    int globalIdx = threadIdx.x+blockDim.x*blockIdx.x;

    volatile __shared__ float uniques[nTPB];

    {
        uniques[threadIdx.x] = 0;
    }

    __syncthreads();


    volatile __shared__ float *TMS[nTPB];

    {
       TMS[threadIdx.x] = &(uniques[threadIdx.x]);
    }

    __syncthreads();

    masterForces[globalIdx] = *TMS[1];
}

int main(){

  float *d_data, *h_data;
  h_data=(float *)malloc(DSIZE*sizeof(float));
  cudaMalloc(&d_data, DSIZE*sizeof(float));
  for (int i = 0; i< DSIZE; i++)
    h_data[i] = 1.0f;
  cudaMemcpy(d_data, h_data, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
  myKern<<<DSIZE/nTPB, nTPB>>>(d_data);
  cudaMemcpy(h_data, d_data, DSIZE*sizeof(float), cudaMemcpyDeviceToHost);
  for (int i = 0; i<DSIZE; i++)
    if (h_data[i] != 0.0f) {printf("mismatch at %d, was: %f should be: %f\n", i, h_data[i], 0.0f); return 1;}
  printf("Success\n");
  return 0;
}
$ nvcc -arch=sm_20 -o t463 t463.cu
$ cuda-memcheck ./t463
========= CUDA-MEMCHECK
Success
========= ERROR SUMMARY: 0 errors
$

将来,请提供一个完整的示例来说明问题,例如这样的问题。因此,如果您不提供问题,这是一个有效的接近原因。

答案 1 :(得分:0)

您的示例代码不应出现Robert Crovella证明的任何错误。我查看了您的原始代码,并通过第一个“for”循环找到了一个未对齐的括号。祝你好运......