如何避免从CUDA函数同时从不同的线程写入?

时间:2015-03-10 11:06:01

标签: c++ cuda synchronization

我的GPU功能有问题,因为结果与串口实现不同。

我认为问题出在函数的最后一行,因为并发线程可以同时修改BFilter的值。

怎么能避免这个?

__device__ void mapBloom(word32 hash, char *(BFilter), int M)
{
    int tempInt;
    int bitNum;
    int byteNum;
    unsigned char mapBit;
    tempInt = hash % M;
    byteNum = tempInt / 8;
    bitNum =  tempInt % 8;

    mapBit = 0x80;
    mapBit = mapBit >> bitNum;

    BFilter[byteNum] = BFilter[byteNum] | mapBit;
}

调用上述函数的内核函数:

__global__ void mapBloomKernel(const char* dev_charTextFromFiles, int *dev_wordIndexes, int *dev_fileIndexes, int* dev_numberOfWords, word32 *CrcTable, word32 crc32, char *BFilter, int *dev_BFStartIndex, int *dev_BF_NumBits)
{

    extern __shared__ int currentBloomFilter[];

    if(threadIdx.x < (dev_BF_NumBits[blockIdx.x]%32==0 ? dev_BF_NumBits[blockIdx.x]/32 : (dev_BF_NumBits[blockIdx.x]/32 + 1)))
        currentBloomFilter[threadIdx.x] = 0;
    __syncthreads();

    unsigned int index = dev_numberOfWords[blockIdx.x] + threadIdx.x + blockIdx.x;

    if(threadIdx.x<dev_numberOfWords[blockIdx.x+1]-dev_numberOfWords[blockIdx.x]-1)
    {

        for(int i=0; i<10; i++)
        {
            crc32 = update_crc(i, dev_charTextFromFiles + dev_wordIndexes[index] + dev_fileIndexes[blockIdx.x], dev_wordIndexes[index+1] - dev_wordIndexes[index], CrcTable);
            mapBloom(crc32, currentBloomFilter, dev_BF_NumBits[blockIdx.x]);
        }
    }
    __syncthreads();
    if(threadIdx.x < (dev_BF_NumBits[blockIdx.x]%32==0 ? dev_BF_NumBits[blockIdx.x]/32 : (dev_BF_NumBits[blockIdx.x]/32 + 1)))
    {
        int n = currentBloomFilter[threadIdx.x];
        BFilter[dev_BFStartIndex[blockIdx.x] + threadIdx.x * 4 + 3] = (n >> 24) & 0xFF;;
        BFilter[dev_BFStartIndex[blockIdx.x] + threadIdx.x * 4 + 2] = (n >> 16) & 0xFF;;
        BFilter[dev_BFStartIndex[blockIdx.x] + threadIdx.x * 4 + 1] = (n >> 8) & 0xFF;
        BFilter[dev_BFStartIndex[blockIdx.x] + threadIdx.x * 4 ] = n & 0xFF;    
    }
}

编辑1:

我考虑将BFilter更改为int而不是char的数组,然后使用AtomicOr。

我修改了mapBloom函数以使用整数Bloom Filter。 这是修改后的功能:

__device__ void mapBloom(word32 hash, int *(BFilter), int M)
{
    int tempInt;
    int bitNum;
    int byteNum;
    unsigned char mapBit;
    tempInt = hash % M;

    byteNum = tempInt / 8;
    bitNum =  tempInt % 8;

    mapBit = 0x80;
    mapBit = mapBit >> bitNum;

    int intNum;
    intNum = tempInt / 32;
    int byteNumInInt = byteNum % 4;

    int mapBitInt = ((int)mapBit)<<(8*byteNumInInt);

    BFilter[intNum] = atomicOr(BFilter + intNum, mapBitInt);

}

串行和并行实现的结果仍然不同。

0 个答案:

没有答案