布隆过滤器哈希返回太多碰撞

时间:2017-03-15 21:38:07

标签: javascript node.js algorithm ecmascript-6 bloom-filter

我一直在尝试实现自己的(简单)布隆过滤器,但我坚持使用散列,我理解多次散列项目的概念并用索引填充位数组。

然而,我在哈希看到了大量的碰撞,我正在使用1种哈希算法(我尝试过FNV,murmurhash和现在的farmhash)和各种种子(基于当前纳秒)。

我一定做错了,我正按照information here计算k函数并设置相同数量的种子。

任何帮助都会很棒,谢谢。



const farmhash = require('farmhash');

class BloomFilter {
	constructor(items, input)
	{
		const BITS_PER_ITEM = 15; //~0.1% false positive rate
		this.m = Buffer.alloc(items.length * BITS_PER_ITEM); // setup bit array
		this.k = Math.ceil(BITS_PER_ITEM * 0.7); // amount of hash functions we need to use
		this.seeds = [];
		this.input = input;
		this.items = items;

		this.setSeeds();
		this.insertItems();
	}

	get time()
	{
		let hrTime = process.hrtime()
		return hrTime[1];
	}

	setSeeds()
	{
		for(let i = 0; i <= this.k; i++) this.seeds.push(this.time);
	}
	
	insertItems()
	{
		console.log('Total buffer size: ' + this.m.length);

		let collisions = 0;
		this.items.forEach(value => {			
			this.getBufferIndices(value).map(index => {
				if(this.m[index] === 1) collisions++;
				this.m[index] = 1;
			});
		});

		console.log('Total collisions: ' + collisions);
	}

	getBufferIndices(value)
	{
		let indicies = [];

		this.seeds.forEach(seed => indicies.push(farmhash.hash32WithSeed(value, seed) % this.m.length));

		return indicies;
	}
}

module.exports = BloomFilter;
&#13;
&#13;
&#13;

1 个答案:

答案 0 :(得分:1)

根据我记得的布隆过滤器,当特定值的所有 k索引与不同值的索引匹配时,就会发生冲突。

看起来您计算的单个存储桶(this.m[index])之前已被设置为冲突。

以下(未经测试的)代码应计算实际的冲突:

let collisions = 0;

this.items.forEach(value => {           
  let overlap = 0;
  this.getBufferIndices(value).map(index => {
    if(this.m[index] === 1) overlap++;
    this.m[index] = 1;
  });
  if (overlap === this.k) collisions++;
});

正如@Thomas在评论中正确指出的那样,你应该使用.map()来代替使用.forEach()(创建新数组):

this.getBufferIndices(value).forEach(index, ...);

getBufferIndices()中,您可以使用.map()代替.forEach()

getBufferIndices(value) {
  return this.seeds.map(seed => (farmhash.hash32WithSeed(value, seed) % this.m.length));
}