Different numbers of threads, different answers

Date: 2012-04-27 04:55:17

Tags: c++ c windows cuda neural-network

I have some neural network simulator code that works correctly on the CPU, and the parallel version agrees with the serial version to at least 6 decimal places when run with a single block of 32 threads on both of my CUDA devices under a Win7 PC. With 1 block and 64 threads, however, it produces slightly different Wt values: the Wt values typically agree to no more than 3 decimal places, and when I try to eliminate the race condition by embedding __syncthreads() inside the loops, the Wt values come back to the CPU as not-a-number.

Can someone give me a hint about what I might be doing wrong? I have included the parallelized code below; knlBackProp is invoked with lSampleQtyReq = 10000, o = 1, and Option = 'R':

// device-global variables to facilitate data transfer
__device__ __constant__ __align__(8) struct rohanContext devSes;
__device__ __constant__ struct rohanLearningSet devLearn;
__device__ __align__(16) struct rohanNetwork devNet;

__device__ double devdReturn[1024*1024];
__device__ double devdRMSE=0;
__device__ int devlReturn[1024*1024];
__device__ int devlTrainable=0;

extern "C"
int knlBackProp(struct rohanContext& rSes, long lSampleQtyReq, long o, char Option)
{mIDfunc /*! divides error in yielded values and back-propagates corrections among weights */
// Option S - single sample correction only
// Option E - keep existing weights, count trainable samples only
// Option R - perform corrections for all trainable samples
    int lTotal=0;

    cudaMemcpyToSymbol( "devlTrainable", &lTotal, sizeof(int) ); // init return value on both sides
        mCheckCudaWorked
    cudaEvent_t start, stop;
    cudaEventCreate( &start);
    cudaEventCreate( &stop);

            cudaEventRecord( start, 0);
        mtkBackPropMT<<< rSes.iBpropBlocks , rSes.iBpropThreads >>>( lSampleQtyReq, o, Option);
            cudaEventRecord( stop, 0);
            mCheckCudaWorked

    cudaMemcpyFromSymbol( &lTotal, "devlTrainable", sizeof(int) ); // retrieve return value
        mCheckCudaWorked
    cudaEventSynchronize( stop);
        float elapsedTime;
        cudaEventElapsedTime( &elapsedTime, start, stop);
    conPrintf("DEVICE: Time to complete BackProp kernel: %3.1f ms\n", elapsedTime);
        cudaEventDestroy( start);
        cudaEventDestroy( stop);

    return lTotal;
}


__global__ void mtkBackPropMT( long lSampleQtyReq, long o, char Option)
{/*! divides error in yielded values and back-propagates corrections among weights */
// Option S - single sample correction only
// Option E - keep existing weights, count trainable samples only
// Option R - perform corrections for all trainable samples

    if(Option=='E' || Option=='e'){ //
        devlTrainable=0; // reset global mem trainable counter
        subkBackPropEoptMT(lSampleQtyReq, o);
    }

    if(Option=='S' || Option=='s'){
        devlTrainable=0; // reset global mem trainable counter
        subkBackPropSoptMT(lSampleQtyReq, false,  devNet, devNet.Signals, devNet.Zs, devNet.Wt, devNet.Deltas, devLearn.gpuXInputs, devLearn.gpuYEval, devLearn.gpudYEval);
    }

    if(Option=='R' || Option=='r'){ //
        devlTrainable=0; // reset global mem trainable counter
        subkBackPropRoptMT(lSampleQtyReq, o);
    }
}


__device__ void subkBackPropRoptMT(long lSampleQtyReq, long o)
{/*! flags and counts samples meeting the MAX criterion and back-propagates corrections for them */
    long OUTROWLEN=devLearn.iOutputQty+1; // prepare array index and width
    //long tIx = threadIdx.x + devSes.iEvalThreads * blockIdx.x; // tIx is thread index over the kernel
    long tIx = threadIdx.x + blockDim.x * blockIdx.x; // tIx is thread index over the kernel
    //long lTotalThreads = devSes.iBpropThreads * devSes.iBpropBlocks; // total number of threads
    double maxSquared = devSes.dMAX * devSes.dMAX ; // needed to compare to stored delta squared values

    devlTrainable=0; // clear global mem accumulator; out of bound samples will remain at this value
    for (long s=0; s<lSampleQtyReq; ++s){ // iterate over samples
        if( devLearn.gpudSE1024[IDX2C( o, s, OUTROWLEN )] > maxSquared ){ // if the MAX criterion is exceeded   
            if(tIx==0)++devlTrainable; // increment the counter
            subkBackPropSoptMT( s, true, devNet, devNet.Signals, devNet.Zs, devNet.Wt, devNet.Deltas, devLearn.gpuXInputs, devLearn.gpuYEval, devLearn.gpudYEval);
        }
    }
} 


__device__ void subkBackPropSoptMT(long s, int o, rohanNetwork& Net, cuDoubleComplex * Signals, cuDoubleComplex * Zs, cuDoubleComplex * Wt, cuDoubleComplex * Deltas, cuDoubleComplex * XInputs, cuDoubleComplex * YEval, double * dYEval )
{/*! propagates adjustment of weights backwards through preceding layers from the chosen network output. */
    // s is sample's index
    // o is an optional method selection parameter; print/don't print as of 2/29/12
    long index, kindex; // for warpwise loops
    long tIx = threadIdx.x + blockDim.x * blockIdx.x; // tIx is thread index over the kernel
    long lTotalThreads = gridDim.x * blockDim.x; // total number of threads
    const cuDoubleComplex cdcZero = { 0, 0 };

    /* clear all temp values BP0 */
    for (long offset=0; (index =offset+tIx)< MAXNEURONS ; offset+=lTotalThreads){ // index stands for i
        Deltas[index]=cdcZero;
        Signals[index]=cdcZero;
        Zs[index]=cdcZero;
    }
    /* re-evaluate sample to load temp values. BPI */
    subkEvalSampleBetaMT( devSes, s, Net, (s==0), Signals, Zs, Wt, XInputs, YEval, dYEval);
    /* begin error calculation. BPII */
    cuDoubleComplex Deltastar /* measured error at the chosen network output. */ ;
    /* calc top layer deltas. */
    long TOP=Net.iLayerQty-1;
    int ROWLEN=Net.iNeuronQTY[TOP];
    //for(int i=0; i<Net.iNeuronQTY[TOP]; ++i){
    for (long offset=0; (index =offset+tIx)< Net.iNeuronQTY[TOP] ; offset+=lTotalThreads){ // index stands for i
         // delta-star = D - Y = Desired output minus actual output from evaluation
         // D is the cplx coords of the sector of the desired answer; Y is the complex result of evaluation of the given sample, unactivated.
        Deltastar = CxSubtractCxUT( 
                        devLearn.gpuDOutputs[ IDX2C( index, s, ROWLEN ) ], 
                        Signals[Net.iNeuronOfst[TOP]+index] );
         /* divide the correction; delta = alpha * delta-star / n+1 (but alpha is always 1 for now). */
        //Deltas[Net.iNeuronOfst[TOP]+index] = CxDivideRlUT( Deltastar, Net.iDendrtQTY[TOP] );
        Deltas[Net.iNeuronOfst[TOP]+index] = CxMultiplyRlUT( Deltastar, Net.dINV_S[TOP] );
    }
    __syncthreads();
    /* Now distribute the correction to lower layers if any. BPII.1 */
    if (Net.iLayerQty>2){  /* remember layer 0 = inputs, layer 1 = bottom row, layer {2..iLayerQty-2} = middle row, layer iLayerQty-1 = top row. */
        for (int L=Net.iLayerQty-1; L>1; --L){
            long LAY = L; /* setup access to layers. */
            long TRIB = L-1; /* trib for tributary.*/
            int iTributQTY=Net.iNeuronQTY[TRIB];
            //int Sj=Net.iDendrtQTY[TRIB]; if (TRIB==1) Sj=1; // Sj=1 for first hidden layer
            for (int i=1; i<Net.iNeuronQTY[LAY]; ++i) { // skip 0th neuron as its weights are either 1 (div identity) or 0 (div forbidden) and don't change anyway
                // k index must begin at 1, neuron zero not valid for correction
                //for (int k=1; k<iTributQTY; ++k) { /* the contribution to ith neuron's kth tributary's delta = i's delta/i's weight k. */
                for (long offset=1; ( kindex =offset+tIx)< iTributQTY ; offset+=lTotalThreads){ // kindex stands for k
                                  Deltas[Net.iNeuronOfst[TRIB]+kindex] 
                    = CxAddCxUT ( Deltas[Net.iNeuronOfst[TRIB]+kindex] , 
                        CxDivideCxUT( 
                            Deltas[Net.iNeuronOfst[LAY]+i] , 
                            Wt[IDX2C( Net.iWeightOfst[LAY]+kindex, i, iTributQTY )] ));
                }
            }
            for (long offset=1; ( kindex =offset+tIx)< iTributQTY ; offset+=lTotalThreads){ // kindex stands for k
                //cuDoubleComplex preDiv=Deltas[Net.iNeuronOfst[TRIB]+kindex]; // diagnostic purpose only, remove if removing other diags
                //Deltas[Net.iNeuronOfst[TRIB]+kindex] 
                //  = CxDivideRlUT( 
                //      Deltas[Net.iNeuronOfst[TRIB]+kindex] , 
                //      Sj );
                Deltas[Net.iNeuronOfst[TRIB]+kindex] 
                    = CxMultiplyRlUT( 
                        Deltas[Net.iNeuronOfst[TRIB]+kindex] , 
                        Net.dINV_S[TRIB] );
            }
        }
    }
    __syncthreads();
    /* error distribution completed */
    /* and now update the weights BP III */
    /* adj weights on first hidden layer. */
        int FHID = 1;
        int SIG = 0;
        int iSignalQTY=Net.iNeuronQTY[SIG]; //rSes.rLearn->iInputQty+1;
        int iHidWidth=Net.iNeuronQTY[FHID];
    for (int k=1; k<iHidWidth; ++k){
        //for (int i=0; i<iSignalQTY; ++i){  
        for (long offset=0; ( index =offset+tIx)< iSignalQTY ; offset+=lTotalThreads){ // index stands for i
            /* dW=d*xbar/s1/|z|= neuron's delta * input's conjugate / ( dendrites+1 * abs of input i ). */
                        Wt[IDX2C( Net.iWeightOfst[FHID]+index, k, iSignalQTY )]
            =CxAddCxUT( Wt[IDX2C( Net.iWeightOfst[FHID]+index, k, iSignalQTY )] , 
                CxDivideRlUT( 
                    CxMultiplyCxUT( 
                        Deltas[Net.iNeuronOfst[FHID]+k] , 
                        CxConjugateUT( Signals[Net.iNeuronOfst[SIG]+index] ) 
                    ) , 
                    CxAbsUT( Zs[Net.iNeuronOfst[FHID]+k] ) // N+1 denominator factor is considered redundant - JAW & IA 2/27/12
                )
            );
        }
    }
    __syncthreads();
    /* re-evaluate sample to update temp values. */
    subkEvalSampleBetaMT( devSes, s, Net, false, Signals, Zs, Wt, XInputs, YEval, dYEval);
    if (Net.iLayerQty>2){
         /* now use those outputs' conjugates and the deltas to adjust middle layers. BP III.1 */
        for (int L=2; L<Net.iLayerQty-1; ++L){
             /* setup access to layers. */
            long LAY = L;
            long TRIB = L-1;
            //int iLayWidth=Net.iNeuronQTY[LAY];
            int iTribWidth=Net.iNeuronQTY[TRIB];
            for (int k=1; k<Net.iNeuronQTY[LAY]; ++k){
                //for (int i=0; i<Net.iNeuronQTY[TRIB]; ++i){  
                for (long offset=0; ( index =offset+tIx)< Net.iNeuronQTY[TRIB] ; offset+=lTotalThreads){ // index stands for i
                    /* the adjustment added to kth neuron's ith trib's weight = k's delta * complex conjugate of i's signal / (abs of k's previous-wt product-sum * dendrites+1)  . */
                                Wt[IDX2C( Net.iWeightOfst[LAY]+index, k, iTribWidth )]
                    =CxAddCxUT( Wt[IDX2C( Net.iWeightOfst[LAY]+index, k, iTribWidth )] , 
                        CxDivideRlUT( 
                            CxMultiplyCxUT( 
                                Deltas[Net.iNeuronOfst[LAY]+k] , 
                                CxConjugateUT( Signals[Net.iNeuronOfst[TRIB]+index] ) 
                            ) ,
                            ( 
                                CxAbsUT( Zs[Net.iNeuronOfst[LAY]+k] ) // N+1 denominator factor is considered redundant - JAW & IA 2/27/12
                            )
                        )
                    );
                }
            }
            /* layer is complete. */
            subkEvalSampleBetaMT( devSes, s, Net, true, Signals, Zs, Wt, XInputs, YEval, dYEval);
        }
    }
    __syncthreads();

    /* correct output layer BP III.3 */
    long SUB = TOP-1; 
    //int iTopWidth=Net.iNeuronQTY[TOP];
    int iSubWidth=Net.iNeuronQTY[SUB];

    for (int k=1; k<Net.iNeuronQTY[TOP]; ++k){
        //for (int i=0; i<Net.iNeuronQTY[SUB]; ++i){  
        for (long offset=0; ( index =offset+tIx)< Net.iNeuronQTY[SUB] ; offset+=lTotalThreads){ // index stands for i
            /* For last layer only, adjustment to kth neuron's ith weight = k's delta * complex conjugate of i's signal / ( dendrites+1)  . */
                        Wt[IDX2C( Net.iWeightOfst[TOP]+index, k, iSubWidth )]
            =CxAddCxUT( Wt[IDX2C( Net.iWeightOfst[TOP]+index, k, iSubWidth )] , 
                CxMultiplyCxUT( 
                    Deltas[Net.iNeuronOfst[TOP]+k] , 
                    CxConjugateUT( Signals[Net.iNeuronOfst[SUB]+index] ) 
                )
            );  // N+1 denominator factor is considered redundant - JAW & IA 2/27/12
        }
    }
    /* backprop is complete. */
}


__device__ void subkEvalSampleBetaMT(rohanContext& Ses, long s, rohanNetwork& Net, int o, cuDoubleComplex * Signals, cuDoubleComplex * Zs, cuDoubleComplex * Wt, cuDoubleComplex * XInputs, cuDoubleComplex * YEval, double * dYEval )
{// Beta uses fixed length fields instead of nested pointer layers
    // delta squared values are not updated here, since they'll be refreshed when RMSE is checked at the end of a pass through the learning set
    long index, kindex; // for warpwise loops
    long tIx = threadIdx.x + blockDim.x * blockIdx.x; // tIx is thread index over the kernel
    long lTotalThreads = gridDim.x * blockDim.x; // total number of threads
    const cuDoubleComplex cdcZero = { 0, 0 };
     /*! layer zero (inputs) is special. */
    long INROWLEN=Net.iNeuronQTY[0];//rSes.rLearn->iInputQty+1;
    //for (int i=0; i<INROWLEN; ++i){
    for (long offset=0; (index =offset+tIx)< INROWLEN ; offset+=lTotalThreads){ // index stands for i
        Signals[Net.iNeuronOfst[0]+index]= XInputs[IDX2C( index, s, INROWLEN )];
    }
     /*! middle and top layers. */
    for (int L=1; L<Net.iLayerQty; ++L){
        //struct rohanLayer& lay = Net.rLayer[L];
        long LAY=L;
        int TRIB=L-1; // index of previous layer
        int iNeuronQTY=Net.iNeuronQTY[LAY];
        int iSignalQTY=Net.iDendrtQTY[LAY]; // signal qty depends on size of previous layer
        //for (int k=0; k<iNeuronQTY; ++k){ //Neuron zero is not skipped, its output should be 1+0i as a check
        for (long offset=0; (kindex =offset+tIx)< iNeuronQTY ; offset+=lTotalThreads){ // kindex stands for k
            Zs[Net.iNeuronOfst[LAY]+kindex]=cdcZero;
            for (int i=0; i<iSignalQTY; ++i){ //walk weights on inputs from previous layer
                           Zs[Net.iNeuronOfst[LAY]+kindex] = 
                CxAddCxUT( Zs[Net.iNeuronOfst[LAY]+kindex] , 
                    CxMultiplyCxUT(
                        Wt[IDX2C( Net.iWeightOfst[LAY] + i, kindex, iSignalQTY )],
                        Signals[Net.iNeuronOfst[TRIB]+i] ) ) ;
            }
            // ACTIVATE //
            Signals[Net.iNeuronOfst[LAY]+kindex] = CxActivateUT( Zs[Net.iNeuronOfst[LAY]+kindex]);
        }
    }
    /*! last layer values are converted and stored here */
    long TOP = Net.iLayerQty-1;
    long OUTROWLEN=Net.iNeuronQTY[TOP];
    //for (int i=0; i<Net.iNeuronQTY[TOP]; ++i){ // continuous conversion begins here 
    for (long offset=0; (index =offset+tIx)< OUTROWLEN ; offset+=lTotalThreads){ // index stands for i
        YEval[IDX2C( index, s, OUTROWLEN )]= Signals[Net.iNeuronOfst[TOP]+index] ; // store final complex output(s)
        dYEval[IDX2C( index, s, OUTROWLEN )]=FUnitCxUT( YEval[IDX2C( index, s, OUTROWLEN )] ) * Net.iSectorQty; // convert final complex outputs to sectors and store that
        if(devLearn.iContOutputs==false) // round off decimal if disc activation is set
            dYEval[IDX2C( index, s, OUTROWLEN )]=int(dYEval[IDX2C( index, s, OUTROWLEN )]);
    }
     /*! end of sample evaluation. */
}

__device__ cuDoubleComplex CxActivateUT(const cuDoubleComplex Z)
{/// applies ContActivation or discrete activation function to cx neuron output and returns Phi(Z)
    /// This fn should be phased out in favor of a GPU device vector based fn
    cuDoubleComplex phi;
    if (devNet.bContActivation) { // apply ContActivation activation function to weighted sum : phi(z)=z/|z|
        phi = CxDivideRlUT( Z, CxAbsUT( Z ) );
    }
    else {  // apply Discrete activation function to weighted sum : s=int(arctan(z)*k/2pi), phi(z)=(X(s),Y(s))
        double theta = atan2(Z.y, Z.x); // theta = arctan y/x
        int iSector = (int)((theta * devNet.dK_DIV_TWO_PI) + devNet.iSectorQty) % devNet.iSectorQty;
        phi = devNet.gpuSectorBdry[iSector];
        //printf(" %f+%fi %d Activate\n", phi.x, phi.y, threadIdx.x);
    }
    return phi;
}

1 Answer:

Answer 0 (score: 0):

I'm not going to read through all of that code, but I can give you a strong hint. The warp size is 32 threads, so in the 64-thread case each block runs two warps. In the former case you cannot have any race condition that depends on the instruction pointer, but in the latter you effectively have two groups of threads, with different instruction pointers, being scheduled at different times. You probably know most of this already (hence the __syncthreads()), but it makes it almost certain that you still have a race condition you haven't accounted for.
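
As a hypothetical illustration (a minimal sketch, not taken from the code above): a kernel in which each thread reads a value that a neighbouring thread writes in the same pass is reproducible as long as the whole block is a single warp, because a warp's loads all issue before its stores, but the moment a second warp exists, the ordering of that read and write across the warp boundary depends on how the warps are scheduled.

    // Hypothetical sketch: each thread adds its left neighbour's value to its own.
    // With <<<1, 32>>> (one warp) the result is reproducible; with <<<1, 64>>>
    // thread 32's read of data[31] races against thread 31's write in the other warp.
    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void neighbourSum(double *data)
    {
        int i = threadIdx.x;
        double left = (i > 0) ? data[i - 1] : 0.0; // may see old or new value across warps
        // __syncthreads(); // a barrier here would force all reads to finish before any write
        data[i] += left;
    }

    int main()
    {
        const int n = 64;
        double h[64], *d;
        for (int i = 0; i < n; ++i) h[i] = 1.0;
        cudaMalloc(&d, n * sizeof(double));
        cudaMemcpy(d, h, n * sizeof(double), cudaMemcpyHostToDevice);
        neighbourSum<<<1, n>>>(d); // compare results with 32 vs 64 threads per block
        cudaMemcpy(h, d, n * sizeof(double), cudaMemcpyDeviceToHost);
        printf("data[32] = %f\n", h[32]);
        cudaFree(d);
        return 0;
    }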

Putting in __syncthreads() is a good start at isolating it. Are you sure that, within your loops, the source data for one warp isn't being overwritten by the other warp? If not, try placing __syncthreads() in the inner loops, purely for debugging purposes, to see what might be causing the race condition.
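
One caution when sprinkling barriers in for debugging (the loop bounds below are made up, not the ones in the posted code): __syncthreads() must be reached by every thread of the block, so placing it inside a loop whose trip count differs between threads, as the grid-stride loops above can when the bound is not a multiple of the thread count, gives undefined behaviour and may itself account for values coming back as NaN. A pattern that keeps the barrier legal is to give every thread the same number of iterations and guard only the body:

    // Debugging sketch with hypothetical bounds (count) rather than the poster's:
    // every thread runs the same number of iterations and reaches every barrier;
    // only the useful work is guarded by the bounds check.
    __device__ void debugStridedLoop(double *buf, int count)
    {
        int tIx = threadIdx.x + blockDim.x * blockIdx.x;
        int lTotalThreads = gridDim.x * blockDim.x;
        int trips = (count + lTotalThreads - 1) / lTotalThreads; // same value in every thread

        for (int t = 0; t < trips; ++t) {
            int index = t * lTotalThreads + tIx;
            if (index < count)
                buf[index] *= 2.0;   // guarded body: out-of-range threads just idle
            __syncthreads();         // reached by all threads of the block every iteration
        }
    }

    // By contrast, a barrier inside the original-style loop is only safe when count
    // is an exact multiple of the thread count, because otherwise some threads leave
    // the loop early and never reach the __syncthreads() the others are waiting at:
    //
    //   for (long offset = 0; (index = offset + tIx) < count; offset += lTotalThreads) {
    //       buf[index] *= 2.0;
    //       __syncthreads();      // undefined if any thread has already exited the loop
    //   }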