布隆过滤器

末蓝、 2022-06-12 22:14 302阅读 0赞

布隆过滤器实际上就是哈希和位图的结合  
它的优点：速度快并且节省空间  
它的缺点：存在误判（比如不同的字符串经过哈希函数计算后可能映射到相同的位置，这样我们在判断的时候就会出现误判）  
这样的误判只会发生在判断结果为“存在”的情况下  
当判断结果为“不存在”时，结论一定是准确的，不会发生误判  
为了降低误判率，我们必须尽可能地减少哈希冲突的影响，  
也就是让一个 Key 值通过多个哈希函数映射到多个位置，只有这些位置全部被置位时才判定该 Key 存在

#include<iostream>
    #include<stdlib.h>
    #include<string>
    #include"BitMap.h"
    using namespace std;
    /// BKDR string hash functor; the default first hash of the Bloom filter below.
    ///
    /// @tparam K  Key type tag. It is not referenced by the implementation:
    ///            the call operator only accepts std::string keys.
    template<class K>
    struct _HashFunc1
    {
        /// Hashes a NUL-terminated byte string with hash = hash * 131 + byte.
        ///
        /// @param str  NUL-terminated string; must not be null.
        /// @return 0 for the empty string. The arithmetic wraps in size_t.
        size_t BKDRHash(const char *str) const
        {
            // `register` was dropped: the specifier no longer exists in C++17.
            size_t hash = 0;
            for (; *str != '\0'; ++str)
            {
                // Convert through unsigned char so bytes >= 0x80 contribute
                // 128..255 everywhere instead of being sign-extended on
                // platforms where plain char is signed.
                const size_t ch = static_cast<unsigned char>(*str);
                hash = hash * 131 + ch;   // multipliers 31, 131, 1313, 13131, 131313... also work
            }
            return hash;
        }

        /// Hashes a std::string key.
        ///
        /// @param s  Key to hash; hashing stops at the first embedded '\0'
        ///           because the C-string view of the key is used.
        /// @return The BKDR hash of the key.
        size_t operator()(const std::string &s) const
        {
            return BKDRHash(s.c_str());
        }
    };
    
    /// SDBM string hash functor; the default second hash of the Bloom filter below.
    ///
    /// @tparam K  Key type tag. It is not referenced by the implementation:
    ///            the call operator only accepts std::string keys.
    template<class K>
    struct _HashFunc2
    {
        /// Hashes a NUL-terminated byte string with hash = 65599 * hash + byte.
        ///
        /// @param str  NUL-terminated string; must not be null.
        /// @return 0 for the empty string. The arithmetic wraps in size_t.
        size_t SDBMHash(const char *str) const
        {
            // `register` was dropped: the specifier no longer exists in C++17.
            size_t hash = 0;
            for (; *str != '\0'; ++str)
            {
                // Convert through unsigned char so bytes >= 0x80 are not
                // sign-extended on platforms where plain char is signed.
                const size_t ch = static_cast<unsigned char>(*str);
                // Same value as the shift form: ch + (hash << 6) + (hash << 16) - hash.
                hash = 65599 * hash + ch;
            }
            return hash;
        }

        /// Hashes a std::string key.
        ///
        /// @param s  Key to hash; hashing stops at the first embedded '\0'.
        /// @return The SDBM hash of the key.
        size_t operator()(const std::string &s) const
        {
            return SDBMHash(s.c_str());
        }
    };
    
    /// Shift/xor string hash functor; the default third hash of the Bloom filter below.
    ///
    /// The member is named RSHash, but the seed 1315423911 and the mixing step
    /// match the algorithm usually published as "JS hash" (Justin Sobel).
    /// The name is kept so existing callers continue to compile.
    ///
    /// @tparam K  Key type tag. It is not referenced by the implementation:
    ///            the call operator only accepts std::string keys.
    template<class K>
    struct _HashFunc3
    {
        /// Hashes a NUL-terminated byte string.
        ///
        /// @param str  NUL-terminated string; must not be null.
        /// @return 0 for the empty string, otherwise the mixed hash value.
        size_t RSHash(const char *str) const
        {
            // Added by the original author so that the empty string hashes to 0
            // rather than to the seed value.
            if (*str == '\0')
                return 0;

            // `register` was dropped: the specifier no longer exists in C++17.
            size_t hash = 1315423911;
            for (; *str != '\0'; ++str)
            {
                // Convert through unsigned char so bytes >= 0x80 are not
                // sign-extended on platforms where plain char is signed.
                const size_t ch = static_cast<unsigned char>(*str);
                hash ^= ((hash << 5) + ch + (hash >> 2));
            }
            return hash;
        }

        /// Hashes a std::string key.
        ///
        /// @param s  Key to hash; hashing stops at the first embedded '\0'.
        /// @return The hash of the key.
        size_t operator()(const std::string &s) const
        {
            return RSHash(s.c_str());
        }
    };
    
    /// RS string hash functor; the default fourth hash of the Bloom filter below.
    ///
    /// @tparam K  Key type tag. It is not referenced by the implementation:
    ///            the call operator only accepts std::string keys.
    template<class K>
    struct _HashFunc4
    {
        /// Hashes a NUL-terminated byte string with a multiplier that changes
        /// on every step: hash = hash * magic + byte; magic *= 378551.
        ///
        /// @param str  NUL-terminated string; must not be null.
        /// @return 0 for the empty string. The arithmetic wraps in size_t.
        size_t RSHash(const char *str) const
        {
            // `register` was dropped: the specifier no longer exists in C++17.
            size_t hash = 0;
            size_t magic = 63689;
            for (; *str != '\0'; ++str)
            {
                // Convert through unsigned char so bytes >= 0x80 are not
                // sign-extended on platforms where plain char is signed.
                const size_t ch = static_cast<unsigned char>(*str);
                hash = hash * magic + ch;
                magic *= 378551;
            }
            return hash;
        }

        /// Hashes a std::string key.
        ///
        /// @param s  Key to hash; hashing stops at the first embedded '\0'.
        /// @return The RS hash of the key.
        size_t operator()(const std::string &s) const
        {
            return RSHash(s.c_str());
        }
    };
    
    /// Alternating shift/xor string hash functor; the default fifth hash of the
    /// Bloom filter below.
    ///
    /// The member is named RSHash, but the even/odd mixing steps match the
    /// algorithm usually published as "AP hash" (Arash Partow). The name is
    /// kept so existing callers continue to compile.
    ///
    /// @tparam K  Key type tag. It is not referenced by the implementation:
    ///            the call operator only accepts std::string keys.
    template<class K>
    struct _HashFunc5
    {
        /// Hashes a NUL-terminated byte string, using one mixing step for bytes
        /// at even positions and a complemented one for bytes at odd positions.
        ///
        /// @param str  NUL-terminated string; must not be null.
        /// @return 0 for the empty string, otherwise the mixed hash value.
        size_t RSHash(const char *str) const
        {
            // `register` was dropped: the specifier no longer exists in C++17.
            size_t hash = 0;
            bool evenPosition = true;   // position 0 counts as even
            for (; *str != '\0'; ++str)
            {
                // Convert through unsigned char so bytes >= 0x80 are not
                // sign-extended on platforms where plain char is signed.
                const size_t ch = static_cast<unsigned char>(*str);
                if (evenPosition)
                {
                    hash ^= ((hash << 7) ^ ch ^ (hash >> 3));
                }
                else
                {
                    hash ^= (~((hash << 11) ^ ch ^ (hash >> 5)));
                }
                evenPosition = !evenPosition;
            }
            return hash;
        }

        /// Hashes a std::string key.
        ///
        /// @param s  Key to hash; hashing stops at the first embedded '\0'.
        /// @return The hash of the key.
        size_t operator()(const std::string &s) const
        {
            return RSHash(s.c_str());
        }
    };
    /// Bloom filter backed by a BitMap, probing five hash functions per key.
    ///
    /// A key is reported present only when all five probed bits are set, so a
    /// "false" answer from Test is definite while a "true" answer may be a
    /// false positive. Removal is not supported.
    ///
    /// @tparam K         Key type (std::string by default).
    /// @tparam HashFun1..HashFun5  Default-constructible functors callable as
    ///                   size_t(const K&).
    template<class K=std::string,typename HashFun1=_HashFunc1<K>,
        typename HashFun2 = _HashFunc2<K>,
        typename HashFun3 = _HashFunc3<K>,
        typename HashFun4 = _HashFunc4<K>,
        typename HashFun5 = _HashFunc5<K>>
    class BloomFiler
    {
    public:
        /// @param n  Expected number of keys. The filter uses n * 5 * 2 bit
        ///           positions (ten per expected key); n == 0 is treated as a
        ///           single position so that Set/Test never divide by zero.
        BloomFiler(size_t n)
            :_bp(RangeFor(n))
            , _range(RangeFor(n))
        {
        }

        /// Marks the five bit positions derived from `key`.
        void Set(const K&key)
        {
            _bp.Set(HashFun1()(key) % _range);
            _bp.Set(HashFun2()(key) % _range);
            _bp.Set(HashFun3()(key) % _range);
            _bp.Set(HashFun4()(key) % _range);
            _bp.Set(HashFun5()(key) % _range);
        }

        /// Checks the five bit positions derived from `key`.
        ///
        /// Probes are evaluated lazily: hashing stops at the first clear bit.
        /// @return false if any probed bit is clear (key definitely absent),
        ///         true if all are set (key possibly present).
        bool Test(const K&key)
        {
            return _bp.Test(HashFun1()(key) % _range)
                && _bp.Test(HashFun2()(key) % _range)
                && _bp.Test(HashFun3()(key) % _range)
                && _bp.Test(HashFun4()(key) % _range)
                && _bp.Test(HashFun5()(key) % _range);
        }
    protected:
        /// Number of bit positions for `n` expected keys; never returns 0.
        static size_t RangeFor(size_t n)
        {
            return n == 0 ? 1 : n * 5 * 2;
        }

        BitMap _bp;      // bit storage; presumably sized in bits by its constructor - confirm in BitMap.h
        size_t _range;   // modulus applied to every hash value
    };
    int main()
    {
        BloomFiler<>bf(500);
        string s1 = "child";
        string s2 = "qqild";
        string s3 = "eeild";
        string s4 = "ddild";
        bf.Set(s1);
        bf.Set(s2);
        bf.Set(s3);
        bf.Set(s4);
        cout<<bf.Test(s1)<<endl;
        cout << bf.Test(s2) << endl;
        cout << bf.Test(s3) << endl;
        cout << bf.Test(s4) << endl;
        system("pause");
        return 0;
    }

但是这样的布隆过滤器不支持删除操作，因为可能会影响其他位置上的元素  
因此，为了支持删除操作，可以将其修改为引用计数版本。但是这样做必须维护引用计数：使用数组来存放每个位置的计数，不再涉及位运算，因此实际上丧失了布隆过滤器原本节省空间的优势。

#include<iostream>
    #include<stdlib.h>
    #include<vector>
    #include<string>
    #include"BitMap.h"
    using namespace std;
    /// BKDR string hash functor; the default first hash of the counting filter below.
    ///
    /// @tparam K  Key type tag. It is not referenced by the implementation:
    ///            the call operator only accepts std::string keys.
    template<class K>
    struct _HashFunc1
    {
        /// Hashes a NUL-terminated byte string with hash = hash * 131 + byte.
        ///
        /// @param str  NUL-terminated string; must not be null.
        /// @return 0 for the empty string. The arithmetic wraps in size_t.
        size_t BKDRHash(const char *str) const
        {
            // `register` was dropped: the specifier no longer exists in C++17.
            size_t hash = 0;
            for (; *str != '\0'; ++str)
            {
                // Convert through unsigned char so bytes >= 0x80 contribute
                // 128..255 everywhere instead of being sign-extended on
                // platforms where plain char is signed.
                const size_t ch = static_cast<unsigned char>(*str);
                hash = hash * 131 + ch;   // multipliers 31, 131, 1313, 13131, 131313... also work
            }
            return hash;
        }

        /// Hashes a std::string key.
        ///
        /// @param s  Key to hash; hashing stops at the first embedded '\0'
        ///           because the C-string view of the key is used.
        /// @return The BKDR hash of the key.
        size_t operator()(const std::string &s) const
        {
            return BKDRHash(s.c_str());
        }
    };
    
    /// SDBM string hash functor; the default second hash of the counting filter below.
    ///
    /// @tparam K  Key type tag. It is not referenced by the implementation:
    ///            the call operator only accepts std::string keys.
    template<class K>
    struct _HashFunc2
    {
        /// Hashes a NUL-terminated byte string with hash = 65599 * hash + byte.
        ///
        /// @param str  NUL-terminated string; must not be null.
        /// @return 0 for the empty string. The arithmetic wraps in size_t.
        size_t SDBMHash(const char *str) const
        {
            // `register` was dropped: the specifier no longer exists in C++17.
            size_t hash = 0;
            for (; *str != '\0'; ++str)
            {
                // Convert through unsigned char so bytes >= 0x80 are not
                // sign-extended on platforms where plain char is signed.
                const size_t ch = static_cast<unsigned char>(*str);
                // Same value as the shift form: ch + (hash << 6) + (hash << 16) - hash.
                hash = 65599 * hash + ch;
            }
            return hash;
        }

        /// Hashes a std::string key.
        ///
        /// @param s  Key to hash; hashing stops at the first embedded '\0'.
        /// @return The SDBM hash of the key.
        size_t operator()(const std::string &s) const
        {
            return SDBMHash(s.c_str());
        }
    };
    
    /// Shift/xor string hash functor; the default third hash of the counting filter below.
    ///
    /// The member is named RSHash, but the seed 1315423911 and the mixing step
    /// match the algorithm usually published as "JS hash" (Justin Sobel).
    /// The name is kept so existing callers continue to compile.
    ///
    /// @tparam K  Key type tag. It is not referenced by the implementation:
    ///            the call operator only accepts std::string keys.
    template<class K>
    struct _HashFunc3
    {
        /// Hashes a NUL-terminated byte string.
        ///
        /// @param str  NUL-terminated string; must not be null.
        /// @return 0 for the empty string, otherwise the mixed hash value.
        size_t RSHash(const char *str) const
        {
            // Added by the original author so that the empty string hashes to 0
            // rather than to the seed value.
            if (*str == '\0')
                return 0;

            // `register` was dropped: the specifier no longer exists in C++17.
            size_t hash = 1315423911;
            for (; *str != '\0'; ++str)
            {
                // Convert through unsigned char so bytes >= 0x80 are not
                // sign-extended on platforms where plain char is signed.
                const size_t ch = static_cast<unsigned char>(*str);
                hash ^= ((hash << 5) + ch + (hash >> 2));
            }
            return hash;
        }

        /// Hashes a std::string key.
        ///
        /// @param s  Key to hash; hashing stops at the first embedded '\0'.
        /// @return The hash of the key.
        size_t operator()(const std::string &s) const
        {
            return RSHash(s.c_str());
        }
    };
    
    /// RS string hash functor; the default fourth hash of the counting filter below.
    ///
    /// @tparam K  Key type tag. It is not referenced by the implementation:
    ///            the call operator only accepts std::string keys.
    template<class K>
    struct _HashFunc4
    {
        /// Hashes a NUL-terminated byte string with a multiplier that changes
        /// on every step: hash = hash * magic + byte; magic *= 378551.
        ///
        /// @param str  NUL-terminated string; must not be null.
        /// @return 0 for the empty string. The arithmetic wraps in size_t.
        size_t RSHash(const char *str) const
        {
            // `register` was dropped: the specifier no longer exists in C++17.
            size_t hash = 0;
            size_t magic = 63689;
            for (; *str != '\0'; ++str)
            {
                // Convert through unsigned char so bytes >= 0x80 are not
                // sign-extended on platforms where plain char is signed.
                const size_t ch = static_cast<unsigned char>(*str);
                hash = hash * magic + ch;
                magic *= 378551;
            }
            return hash;
        }

        /// Hashes a std::string key.
        ///
        /// @param s  Key to hash; hashing stops at the first embedded '\0'.
        /// @return The RS hash of the key.
        size_t operator()(const std::string &s) const
        {
            return RSHash(s.c_str());
        }
    };
    
    /// Alternating shift/xor string hash functor; the default fifth hash of the
    /// counting filter below.
    ///
    /// The member is named RSHash, but the even/odd mixing steps match the
    /// algorithm usually published as "AP hash" (Arash Partow). The name is
    /// kept so existing callers continue to compile.
    ///
    /// @tparam K  Key type tag. It is not referenced by the implementation:
    ///            the call operator only accepts std::string keys.
    template<class K>
    struct _HashFunc5
    {
        /// Hashes a NUL-terminated byte string, using one mixing step for bytes
        /// at even positions and a complemented one for bytes at odd positions.
        ///
        /// @param str  NUL-terminated string; must not be null.
        /// @return 0 for the empty string, otherwise the mixed hash value.
        size_t RSHash(const char *str) const
        {
            // `register` was dropped: the specifier no longer exists in C++17.
            size_t hash = 0;
            bool evenPosition = true;   // position 0 counts as even
            for (; *str != '\0'; ++str)
            {
                // Convert through unsigned char so bytes >= 0x80 are not
                // sign-extended on platforms where plain char is signed.
                const size_t ch = static_cast<unsigned char>(*str);
                if (evenPosition)
                {
                    hash ^= ((hash << 7) ^ ch ^ (hash >> 3));
                }
                else
                {
                    hash ^= (~((hash << 11) ^ ch ^ (hash >> 5)));
                }
                evenPosition = !evenPosition;
            }
            return hash;
        }

        /// Hashes a std::string key.
        ///
        /// @param s  Key to hash; hashing stops at the first embedded '\0'.
        /// @return The hash of the key.
        size_t operator()(const std::string &s) const
        {
            return RSHash(s.c_str());
        }
    };
    /// Counting ("reference-counted") Bloom filter that supports removal.
    ///
    /// Each of the five hash functions selects one counter per key. Set
    /// increments those counters, Reset decrements them, and Test reports a key
    /// as possibly present only when all five counters are non-zero. Storing a
    /// full size_t per position trades the space savings of a bitmap for the
    /// ability to delete.
    ///
    /// @tparam K         Key type (std::string by default).
    /// @tparam HashFun1..HashFun5  Default-constructible functors callable as
    ///                   size_t(const K&).
    template<class K = std::string, typename HashFun1 = _HashFunc1<K>,
        typename HashFun2 = _HashFunc2<K>,
        typename HashFun3 = _HashFunc3<K>,
        typename HashFun4 = _HashFunc4<K>,
        typename HashFun5 = _HashFunc5<K >>
    class BloomFilerRef
    {
    public:
        /// @param n  Expected number of keys. The filter allocates n * 5 * 2
        ///           counters. A non-positive n is treated as a single counter
        ///           so the modulus is never zero and a negative value cannot
        ///           wrap to a huge allocation. The product is computed in
        ///           size_t to avoid int overflow.
        BloomFilerRef(const int&n)
            : _range(n > 0 ? static_cast<size_t>(n) * 5 * 2 : 1)
        {
            _v.resize(_range);
        }

        /// Inserts `key` by incrementing its five counters.
        void Set(const K&key)
        {
            size_t pos[kProbes];
            Positions(key, pos);
            for (size_t i = 0; i < kProbes; ++i)
            {
                ++_v[pos[i]];
            }
        }

        /// Removes one occurrence of `key` by decrementing its five counters.
        ///
        /// If any counter is already zero the key was never inserted, and the
        /// call is a no-op. Decrementing in that case would wrap the unsigned
        /// counters and make unrelated keys test as present.
        void Reset(const K&key)
        {
            size_t pos[kProbes];
            Positions(key, pos);
            for (size_t i = 0; i < kProbes; ++i)
            {
                if (_v[pos[i]] == 0)
                {
                    return;
                }
            }
            for (size_t i = 0; i < kProbes; ++i)
            {
                // Two probes may land on the same counter; when the key is only
                // a false positive that counter can reach zero mid-loop, so
                // each decrement is guarded individually.
                if (_v[pos[i]] > 0)
                {
                    --_v[pos[i]];
                }
            }
        }

        /// Checks whether `key` may be present.
        ///
        /// Each probe uses its own hash function and the probes are evaluated
        /// lazily, stopping at the first zero counter.
        /// @return false if any counter is zero (key definitely absent),
        ///         true if all five are non-zero (key possibly present).
        bool Test(const K&key) const
        {
            return _v[HashFun1()(key) % _range] != 0
                && _v[HashFun2()(key) % _range] != 0
                && _v[HashFun3()(key) % _range] != 0
                && _v[HashFun4()(key) % _range] != 0
                && _v[HashFun5()(key) % _range] != 0;
        }
    protected:
        enum { kProbes = 5 };   // number of hash functions / counters per key

        /// Fills `pos` with the five counter indices selected for `key`.
        void Positions(const K&key, size_t (&pos)[kProbes]) const
        {
            pos[0] = HashFun1()(key) % _range;
            pos[1] = HashFun2()(key) % _range;
            pos[2] = HashFun3()(key) % _range;
            pos[3] = HashFun4()(key) % _range;
            pos[4] = HashFun5()(key) % _range;
        }

        std::vector<size_t>_v;   // one reference counter per position
        size_t _range;           // number of counters; modulus applied to every hash
    };