我一直在尝试实现自己的(简单)布隆过滤器,但我坚持使用散列,我理解多次散列项目的概念并用索引填充位数组。
然而,我在哈希看到了大量的碰撞,我正在使用1种哈希算法(我尝试过FNV,murmurhash和现在的farmhash)和各种种子(基于当前纳秒)。
我一定做错了,我正按照information here计算k
函数并设置相同数量的种子。
任何帮助都会很棒,谢谢。
const farmhash = require('farmhash');
class BloomFilter {
constructor(items, input)
{
const BITS_PER_ITEM = 15; //~0.1% false positive rate
this.m = Buffer.alloc(items.length * BITS_PER_ITEM); // setup bit array
this.k = Math.ceil(BITS_PER_ITEM * 0.7); // amount of hash functions we need to use
this.seeds = [];
this.input = input;
this.items = items;
this.setSeeds();
this.insertItems();
}
get time()
{
let hrTime = process.hrtime()
return hrTime[1];
}
setSeeds()
{
for(let i = 0; i <= this.k; i++) this.seeds.push(this.time);
}
insertItems()
{
console.log('Total buffer size: ' + this.m.length);
let collisions = 0;
this.items.forEach(value => {
this.getBufferIndices(value).map(index => {
if(this.m[index] === 1) collisions++;
this.m[index] = 1;
});
});
console.log('Total collisions: ' + collisions);
}
getBufferIndices(value)
{
let indicies = [];
this.seeds.forEach(seed => indicies.push(farmhash.hash32WithSeed(value, seed) % this.m.length));
return indicies;
}
}
module.exports = BloomFilter;
&#13;
答案 0 :(得分:1)
根据我记得的布隆过滤器,当特定值的所有 k
索引与不同值的索引匹配时,就会发生冲突。
看起来您计算的单个存储桶(this.m[index]
)之前已被设置为冲突。
以下(未经测试的)代码应计算实际的冲突:
let collisions = 0;
this.items.forEach(value => {
let overlap = 0;
this.getBufferIndices(value).map(index => {
if(this.m[index] === 1) overlap++;
this.m[index] = 1;
});
if (overlap === this.k) collisions++;
});
正如@Thomas在评论中正确指出的那样,你应该使用.map()
来代替使用.forEach()
(创建新数组):
this.getBufferIndices(value).forEach(index, ...);
在getBufferIndices()
中,您可以使用.map()
代替.forEach()
:
getBufferIndices(value) {
return this.seeds.map(seed => (farmhash.hash32WithSeed(value, seed) % this.m.length));
}