所以我有一些神经网络模拟器代码可以在CPU上正常工作,并行版本与串行版本一致至少6个小数位,在Win7 PC下我的两个CUDA上都有32个线程的单个块,但是使用1个块和64个线程生成稍微不同的Wt值。 Wt值通常不超过3个小数位,当我尝试通过在循环中嵌入__syncthreads()来消除竞争条件时,Wt值在复制回CPU时显示为非数字。
有人可以暗示我可能做错了吗?我已经包含了下面的并行化代码,并且使用lSampleQtyReq = 10000,o = 1和Option ='R'来调用knlBackProp:
// device-global variables to facilitate data transfer
__device__ __constant__ __align__(8) struct rohanContext devSes;
__device__ __constant__ struct rohanLearningSet devLearn;
__device__ __align__(16) struct rohanNetwork devNet;
__device__ double devdReturn[1024*1024];
__device__ double devdRMSE=0;
__device__ int devlReturn[1024*1024];
__device__ int devlTrainable=0;
extern"C"
int knlBackProp(struct rohanContext& rSes, long lSampleQtyReq, long o, char Option)
{mIDfunc /*! divides error in yielded values and back-propagates corrections among weights */
// Option S - single sample correction only
// Option E - keep existing weights, count trainable samples only
// Option R - perform corrections for all trainable samples
int lTotal=0;
cudaMemcpyToSymbol( "devlTrainable", &lTotal, sizeof(int) ); // init return value on both sides
mCheckCudaWorked
cudaEvent_t start, stop;
cudaEventCreate( &start);
cudaEventCreate( &stop);
cudaEventRecord( start, 0);
mtkBackPropMT<<< rSes.iBpropBlocks , rSes.iBpropThreads >>>( lSampleQtyReq, o, Option);
cudaEventRecord( stop, 0);
mCheckCudaWorked
cudaMemcpyFromSymbol( &lTotal, "devlTrainable", sizeof(long) ); // retrieve return value
mCheckCudaWorked
cudaEventSynchronize( stop);
float elapsedTime;
cudaEventElapsedTime( &elapsedTime, start, stop);
conPrintf("DEVICE: Time to complete BackProp kernel: %3.1f ms\n", elapsedTime);
cudaEventDestroy( start);
cudaEventDestroy( stop);
return lTotal;
}
__global__ __device__ void mtkBackPropMT( long lSampleQtyReq, long o, char Option)
{/*! divides error in yielded values and back-propagates corrections among weights */
// Option S - single sample correction only
// Option E - keep existing weights, count trainable samples only
// Option R - perform corrections for all trainable samples
if(Option=='E' || Option=='e'){ //
devlTrainable=0; // reset global mem trainable counter
subkBackPropEoptMT(lSampleQtyReq, o);
}
if(Option=='S' || Option=='s'){
devlTrainable=0; // reset global mem trainable counter
subkBackPropSoptMT(lSampleQtyReq, false, devNet, devNet.Signals, devNet.Zs, devNet.Wt, devNet.Deltas, devLearn.gpuXInputs, devLearn.gpuYEval, devLearn.gpudYEval);
}
if(Option=='R' || Option=='r'){ //
devlTrainable=0; // reset global mem trainable counter
subkBackPropRoptMT(lSampleQtyReq, o);
}
}
__device__ void subkBackPropRoptMT(long lSampleQtyReq, long o)
{/*! flags and counts samples meeting */
long OUTROWLEN=devLearn.iOutputQty+1; // prepare array index and width
//long tIx = threadIdx.x + devSes.iEvalThreads * blockIdx.x; // tIx is thread index over the kernel
long tIx = threadIdx.x + blockDim.x * blockIdx.x; // tIx is thread index over the kernel
//long lTotalThreads = devSes.iBpropThreads * devSes.iBpropBlocks; // total number of threads
double maxSquared = devSes.dMAX * devSes.dMAX ; //needed to compart to stored delta squared values
devlTrainable=0; // clear global mem accumulator; out of bound samples will remain at this value
for (long s=0; s<lSampleQtyReq; ++s){ // iterate over samples
if( devLearn.gpudSE1024[IDX2C( o, s, OUTROWLEN )] > maxSquared ){ // if the MAX criterion is exceeded
if(tIx==0)++devlTrainable; // increment the counter
subkBackPropSoptMT( s, true, devNet, devNet.Signals, devNet.Zs, devNet.Wt, devNet.Deltas, devLearn.gpuXInputs, devLearn.gpuYEval, devLearn.gpudYEval);
}
}
}
__device__ void subkBackPropSoptMT(long s, int o, rohanNetwork& Net, cuDoubleComplex * Signals, cuDoubleComplex * Zs, cuDoubleComplex * Wt, cuDoubleComplex * Deltas, cuDoubleComplex * XInputs, cuDoubleComplex * YEval, double * dYEval )
{/*! propagates adjustment of weights backwards preceeding layers from the chosen network output. */
// s is sample's index
// o is an optional method selection parameter; print/don't print as of 2/29/12
long index, kindex; // for warpwise loops
long tIx = threadIdx.x + blockDim.x * blockIdx.x; // tIx is thread index over the kernel
long lTotalThreads = gridDim.x * blockDim.x; // total number of threads
const cuDoubleComplex cdcZero = { 0, 0 };
/* clear all temp values BP0 */
for (long offset=0; (index =offset+tIx)< MAXNEURONS ; offset+=lTotalThreads){ // index stands for i
Deltas[index]=cdcZero;
Signals[index]=cdcZero;
Zs[index]=cdcZero;
}
/* re-evaluate sample to load temp values. BPI */
subkEvalSampleBetaMT( devSes, s, Net, (s==0), Signals, Zs, Wt, XInputs, YEval, dYEval);
/* begin error calculation. BPII */
cuDoubleComplex Deltastar /* measured error at the chosen network output. */ ;
/* calc top layer deltas. */
long TOP=Net.iLayerQty-1;
int ROWLEN=Net.iNeuronQTY[TOP];
//for(int i=0; i<Net.iNeuronQTY[TOP]; ++i){
for (long offset=0; (index =offset+tIx)< Net.iNeuronQTY[TOP] ; offset+=lTotalThreads){ // index stands for i
// delta-star = D - Y = Desired output minus actual output from evaluation
// D is the cplx coords of the sector of the desired answer Y is the complex result of evaluation of the given sample, unactivated. */
Deltastar = CxSubtractCxUT(
devLearn.gpuDOutputs[ IDX2C( index, s, ROWLEN ) ],
Signals[Net.iNeuronOfst[TOP]+index] );
/* divide the correction; delta = alpha * delta-star / n+1 (but alpha is always 1 for now). */
//Deltas[Net.iNeuronOfst[TOP]+index] = CxDivideRlUT( Deltastar, Net.iDendrtQTY[TOP] );
Deltas[Net.iNeuronOfst[TOP]+index] = CxMultiplyRlUT( Deltastar, Net.dINV_S[TOP] );
}
__syncthreads();
/* Now distribute the correction to lower layers if any. BPII.1 */
if (Net.iLayerQty>2){ /* remember layer 0 = inputs, layer 1 = bottom row, layer {2..iLayerQty-2} = middle row, layer iLayerQty-1 = top row. */
for (int L=Net.iLayerQty-1; L>1; --L){
long LAY = L; /* setup access to layers. */
long TRIB = L-1; /* trib for tributary.*/
int iTributQTY=Net.iNeuronQTY[TRIB];
//int Sj=Net.iDendrtQTY[TRIB]; if (TRIB==1) Sj=1; // Sj=1 for firest hidden layer
for (int i=1; i<Net.iNeuronQTY[LAY]; ++i) { // skip 0th neuron as its weights are either 1 (div identity) or 0 (div forbidden) and don't change anyway
// k index must begin at 1, neuron zero not valid for correction
//for (int k=1; k<iTributQTY; ++k) { /* the contribution to ith neuron's kth tributary's delta = i's delta/i's weight k. */
for (long offset=1; ( kindex =offset+tIx)< iTributQTY ; offset+=lTotalThreads){ // kindex stands for k
Deltas[Net.iNeuronOfst[TRIB]+kindex]
= CxAddCxUT ( Deltas[Net.iNeuronOfst[TRIB]+kindex] ,
CxDivideCxUT(
Deltas[Net.iNeuronOfst[LAY]+i] ,
Wt[IDX2C( Net.iWeightOfst[LAY]+kindex, i, iTributQTY )] ));
}
}
for (long offset=1; ( kindex =offset+tIx)< iTributQTY ; offset+=lTotalThreads){ // kindex stands for k
//cuDoubleComplex preDiv=Deltas[Net.iNeuronOfst[TRIB]+kindex]; // diagnostic purpose only, remove if removing other diags
//Deltas[Net.iNeuronOfst[TRIB]+kindex]
// = CxDivideRlUT(
// Deltas[Net.iNeuronOfst[TRIB]+kindex] ,
// Sj );
Deltas[Net.iNeuronOfst[TRIB]+kindex]
= CxMultiplyRlUT(
Deltas[Net.iNeuronOfst[TRIB]+kindex] ,
Net.dINV_S[TRIB] );
}
}
}
__syncthreads();
/* error distribution completed */
/* and now update the weights BP III */
/* adj weights on first hidden layer. */
int FHID = 1;
int SIG = 0;
int iSignalQTY=Net.iNeuronQTY[SIG]; //rSes.rLearn->iInputQty+1;
int iHidWidth=Net.iNeuronQTY[FHID];
for (int k=1; k<iHidWidth; ++k){
//for (int i=0; i<iSignalQTY; ++i){
for (long offset=0; ( index =offset+tIx)< iSignalQTY ; offset+=lTotalThreads){ // index stands for i
/* dW=d*xbar/s1/|z|= neuron's delta * input's conjugate / ( dendrites+1 * abs of input i ). */
Wt[IDX2C( Net.iWeightOfst[FHID]+index, k, iSignalQTY )]
=CxAddCxUT( Wt[IDX2C( Net.iWeightOfst[FHID]+index, k, iSignalQTY )] ,
CxDivideRlUT(
CxMultiplyCxUT(
Deltas[Net.iNeuronOfst[FHID]+k] ,
CxConjugateUT( Signals[Net.iNeuronOfst[SIG]+index] )
) ,
CxAbsUT( Zs[Net.iNeuronOfst[FHID]+k] ) // N+1 denominator factor is considered redundant - JAW & IA 2/27/12
)
);
}
}
__syncthreads();
/* re-evaluate sample to update temp values. */
subkEvalSampleBetaMT( devSes, s, Net, false, Signals, Zs, Wt, XInputs, YEval, dYEval);
if (Net.iLayerQty>2){
/* now use those outputs' conjugates and the deltas to adjust middle layers. BP III.1 */
for (int L=2; L<Net.iLayerQty-1; ++L){
/* setup access to layers. */
long LAY = L;
long TRIB = L-1;
//int iLayWidth=Net.iNeuronQTY[LAY];
int iTribWidth=Net.iNeuronQTY[TRIB];
for (int k=1; k<Net.iNeuronQTY[LAY]; ++k){
//for (int i=0; i<Net.iNeuronQTY[TRIB]; ++i){
for (long offset=0; ( index =offset+tIx)< Net.iNeuronQTY[TRIB] ; offset+=lTotalThreads){ // index stands for i
/* the adjustment added to kth neuron's ith trib's weight = k's delta * complex conjugate of i's signal / (abs of k's previous-wt product-sum * dendrites+1) . */
Wt[IDX2C( Net.iWeightOfst[LAY]+index, k, iTribWidth )]
=CxAddCxUT( Wt[IDX2C( Net.iWeightOfst[LAY]+index, k, iTribWidth )] ,
CxDivideRlUT(
CxMultiplyCxUT(
Deltas[Net.iNeuronOfst[LAY]+k] ,
CxConjugateUT( Signals[Net.iNeuronOfst[TRIB]+index] )
) ,
(
CxAbsUT( Zs[Net.iNeuronOfst[LAY]+k] ) // N+1 denominator factor is considered redundant - JAW & IA 2/27/12
)
)
);
}
}
/* layer is complete. */
subkEvalSampleBetaMT( devSes, s, Net, true, Signals, Zs, Wt, XInputs, YEval, dYEval);
}
}
__syncthreads();
/* correct output layer BP III.3 */
long SUB = TOP-1;
//int iTopWidth=Net.iNeuronQTY[TOP];
int iSubWidth=Net.iNeuronQTY[SUB];
for (int k=1; k<Net.iNeuronQTY[TOP]; ++k){
//for (int i=0; i<Net.iNeuronQTY[SUB]; ++i){
for (long offset=0; ( index =offset+tIx)< Net.iNeuronQTY[SUB] ; offset+=lTotalThreads){ // index stands for i
/* For last layer only, adjustment to kth neuron's ith weight = k's delta * complex conjugate of i's signal / ( dendrites+1) . */
Wt[IDX2C( Net.iWeightOfst[TOP]+index, k, iSubWidth )]
=CxAddCxUT( Wt[IDX2C( Net.iWeightOfst[TOP]+index, k, iSubWidth )] ,
CxMultiplyCxUT(
Deltas[Net.iNeuronOfst[TOP]+k] ,
CxConjugateUT( Signals[Net.iNeuronOfst[SUB]+index] )
)
); // N+1 denominator factor is considered redundant - JAW & IA 2/27/12
}
}
/* backprop is complete. */
}
__device__ void subkEvalSampleBetaMT(rohanContext& Ses, long s, rohanNetwork& Net, int o, cuDoubleComplex * Signals, cuDoubleComplex * Zs, cuDoubleComplex * Wt, cuDoubleComplex * XInputs, cuDoubleComplex * YEval, double * dYEval )
{// Beta uses fixed length fields instead of nested pointer layers
// delta squared is not updated, since they'll be updated when RMSE is checked at the end of a pass through the learning set
long index, kindex; // for warpwise loops
long tIx = threadIdx.x + blockDim.x * blockIdx.x; // tIx is thread index over the kernel
long lTotalThreads = gridDim.x * blockDim.x; // total number of threads
const cuDoubleComplex cdcZero = { 0, 0 };
/*! layer zero (inputs) is special. */
long INROWLEN=Net.iNeuronQTY[0];//rSes.rLearn->iInputQty+1;
//for (int i=0; i<INROWLEN; ++i){
for (long offset=0; (index =offset+tIx)< INROWLEN ; offset+=lTotalThreads){ // index stands for i
Signals[Net.iNeuronOfst[0]+index]= XInputs[IDX2C( index, s, INROWLEN )];
}
/*! middle and top layers. */
for (int L=1; L<Net.iLayerQty; ++L){
//struct rohanLayer& lay = Net.rLayer[L];
long LAY=L;
int TRIB=L-1; // index of previous layer
int iNeuronQTY=Net.iNeuronQTY[LAY];
int iSignalQTY=Net.iDendrtQTY[LAY]; // signal qty depends on size of previous layer
//for (int k=0; k<iNeuronQTY; ++k){ //Neuron zero is not skipped, its output should be 1+0i as a check
for (long offset=0; (kindex =offset+tIx)< iNeuronQTY ; offset+=lTotalThreads){ // kindex stands for k
Zs[Net.iNeuronOfst[LAY]+kindex]=cdcZero;
for (int i=0; i<iSignalQTY; ++i){ //walk weights on inputs from previous layer
Zs[Net.iNeuronOfst[LAY]+kindex] =
CxAddCxUT( Zs[Net.iNeuronOfst[LAY]+kindex] ,
CxMultiplyCxUT(
Wt[IDX2C( Net.iWeightOfst[LAY] + i, kindex, iSignalQTY )],
Signals[Net.iNeuronOfst[TRIB]+i] ) ) ;
}
// ACTIVATE //
Signals[Net.iNeuronOfst[LAY]+kindex] = CxActivateUT( Zs[Net.iNeuronOfst[LAY]+kindex]);
}
}
/*! last layer values are converted and stored here */
long TOP = Net.iLayerQty-1;
long OUTROWLEN=Net.iNeuronQTY[TOP];
//for (int i=0; i<Net.iNeuronQTY[TOP]; ++i){ // continuous conversion begins here
for (long offset=0; (index =offset+tIx)< OUTROWLEN ; offset+=lTotalThreads){ // index stands for i
YEval[IDX2C( index, s, OUTROWLEN )]= Signals[Net.iNeuronOfst[TOP]+index] ; // store final complex output(s)
dYEval[IDX2C( index, s, OUTROWLEN )]=FUnitCxUT( YEval[IDX2C( index, s, OUTROWLEN )] ) * Net.iSectorQty; // convert final complex outputs to sectors and store that
if(devLearn.iContOutputs==false) // round off decimal if disc activation is set
dYEval[IDX2C( index, s, OUTROWLEN )]=int(dYEval[IDX2C( index, s, OUTROWLEN )]);
}
/*! end of sample evaluation. */
}
__device__ cuDoubleComplex CxActivateUT(const cuDoubleComplex Z)
{/// applies ContActivation or discrete activation function to cx neuron output and returns Phi(Z)
/// This fn should be phased out in favor of a GPU device vector based fn
cuDoubleComplex phi;
if (devNet.bContActivation) { // apply ContActivation activation function to weighted sum : phi(z)=z/|z|
phi = CxDivideRlUT( Z, CxAbsUT( Z ) );
}
else { // apply Discrete activation function to weighted sum : s=int(arctan(z)*k/2pi), phi(z)=(X(s),Y(s))
double theta = atan2(Z.y, Z.x); // theta = arctan y/x
int iSector = (int)((theta * devNet.dK_DIV_TWO_PI) + devNet.iSectorQty) % devNet.iSectorQty;
phi = devNet.gpuSectorBdry[iSector];
//printf(" %f+%fi %d Activate\n", phi.x, phi.y, threadIdx.x);
}
return phi;
}
答案 0 :(得分:0)
所以,我不打算阅读所有代码,但我可以给你一个强烈的暗示。 warp大小是32个线程,因此64线程的情况下将运行两个warp / block - 在前一种情况下你不能有任何基于指令指针的竞争条件,但是,在第二种情况下,你将有效地拥有两个组在不同时间安排的具有不同IP的线程。你可能已经知道了很多(因此同义词),但上面的确让你几乎可以肯定你还有一个你尚未考虑的竞争条件。
放入同步线程是尝试隔离它的良好开端。你确定在你的循环中,一个warp的源数据不会被另一个warp覆盖吗?如果不是,请尝试将syncthreads放入内部循环中,仅用于调试目的,以查看可能导致竞争条件的原因。