我的GPU功能有问题,因为结果与串口实现不同。
我认为问题出在函数的最后一行,因为并发线程可以同时修改BFilter的值。
怎么能避免这个?
__device__ void mapBloom(word32 hash, char *(BFilter), int M)
{
int tempInt;
int bitNum;
int byteNum;
unsigned char mapBit;
tempInt = hash % M;
byteNum = tempInt / 8;
bitNum = tempInt % 8;
mapBit = 0x80;
mapBit = mapBit >> bitNum;
BFilter[byteNum] = BFilter[byteNum] | mapBit;
}
调用上述函数的内核函数:
__global__ void mapBloomKernel(const char* dev_charTextFromFiles, int *dev_wordIndexes, int *dev_fileIndexes, int* dev_numberOfWords, word32 *CrcTable, word32 crc32, char *BFilter, int *dev_BFStartIndex, int *dev_BF_NumBits)
{
extern __shared__ int currentBloomFilter[];
if(threadIdx.x < (dev_BF_NumBits[blockIdx.x]%32==0 ? dev_BF_NumBits[blockIdx.x]/32 : (dev_BF_NumBits[blockIdx.x]/32 + 1)))
currentBloomFilter[threadIdx.x] = 0;
__syncthreads();
unsigned int index = dev_numberOfWords[blockIdx.x] + threadIdx.x + blockIdx.x;
if(threadIdx.x<dev_numberOfWords[blockIdx.x+1]-dev_numberOfWords[blockIdx.x]-1)
{
for(int i=0; i<10; i++)
{
crc32 = update_crc(i, dev_charTextFromFiles + dev_wordIndexes[index] + dev_fileIndexes[blockIdx.x], dev_wordIndexes[index+1] - dev_wordIndexes[index], CrcTable);
mapBloom(crc32, currentBloomFilter, dev_BF_NumBits[blockIdx.x]);
}
}
__syncthreads();
if(threadIdx.x < (dev_BF_NumBits[blockIdx.x]%32==0 ? dev_BF_NumBits[blockIdx.x]/32 : (dev_BF_NumBits[blockIdx.x]/32 + 1)))
{
int n = currentBloomFilter[threadIdx.x];
BFilter[dev_BFStartIndex[blockIdx.x] + threadIdx.x * 4 + 3] = (n >> 24) & 0xFF;;
BFilter[dev_BFStartIndex[blockIdx.x] + threadIdx.x * 4 + 2] = (n >> 16) & 0xFF;;
BFilter[dev_BFStartIndex[blockIdx.x] + threadIdx.x * 4 + 1] = (n >> 8) & 0xFF;
BFilter[dev_BFStartIndex[blockIdx.x] + threadIdx.x * 4 ] = n & 0xFF;
}
}
我考虑将BFilter更改为int而不是char的数组,然后使用AtomicOr。
我修改了mapBloom函数以使用整数Bloom Filter。 这是修改后的功能:
__device__ void mapBloom(word32 hash, int *(BFilter), int M)
{
int tempInt;
int bitNum;
int byteNum;
unsigned char mapBit;
tempInt = hash % M;
byteNum = tempInt / 8;
bitNum = tempInt % 8;
mapBit = 0x80;
mapBit = mapBit >> bitNum;
int intNum;
intNum = tempInt / 32;
int byteNumInInt = byteNum % 4;
int mapBitInt = ((int)mapBit)<<(8*byteNumInInt);
BFilter[intNum] = atomicOr(BFilter + intNum, mapBitInt);
}
串行和并行实现的结果仍然不同。