哈希Hash-平芜编程栈

哈希表的实现：

哈希概念：

哈希(hash)⼜称散列，是⼀种组织数据的⽅式。从译名来看，有散乱排列的意思。本质就是通过哈希函数把关键字Key跟存储位置建⽴⼀个映射关系，查找时通过这个哈希函数计算出Key存储的位置，进⾏快速查找。

直接定址法：

当关键字的范围⽐较集中时，直接定址法就是⾮常简单⾼效的⽅法，⽐如⼀组关键字都在[0,99]之间，那么我们开⼀个100个数的数组，每个关键字的值直接就是存储位置的下标。再⽐如⼀组关键字值都在[a,z]的⼩写字⺟，那么我们开⼀个26个数的数组，每个关键字acsii码-a ascii码就是存储位置的下标。也就是说直接定址法本质就是⽤关键字计算出⼀个绝对位置或者相对位置。

哈希冲突：

直接定址法的缺点也⾮常明显，当关键字的范围⽐较分散时，就很浪费内存甚⾄内存不够⽤。假设我们只有数据范围是[0, 9999]的N个值，我们要映射到⼀个M个空间的数组中(⼀般情况下M >= N)，那么就要借助哈希函数(hash function)hf，关键字key被放到数组的h(key)位置，这⾥要注意的是h(key)计算出的值必须在[0, M)之间。这⾥存在的⼀个问题就是，两个不同的key可能会映射到同⼀个位置去，这种问题我们叫做哈希冲突，或者哈希碰撞。理想情况是找出⼀个好的哈希函数避免冲突，但是实际场景中，冲突是不可避免的，所以我们尽可能设计出优秀的哈希函数，减少冲突的次数，同时也要去设计出解决冲突的⽅案。

负载因⼦：

假设哈希表中已经映射存储了N个值，哈希表的⼤⼩为M，那么，负载因⼦有些地⽅也翻译为载荷因⼦/装载因⼦等，他的英⽂为load factor。负载因⼦越⼤，哈希冲突的概率越⾼，空间利⽤率越⾼；负载因⼦越⼩，哈希冲突的概率越低，空间利⽤率越低；

哈希函数：

⼀个好的哈希函数应该让N个关键字被等概率的均匀的散列分布到哈希表的M个空间中，但是实际中却很难做到，但是我们要尽量往这个⽅向去考量设计

除法散列法/除留余数法

•

除法散列法也叫做除留余数法，顾名思义，假设哈希表的⼤⼩为M，那么通过key除以M的余数作为映射位置的下标，也就是哈希函数为：h(key) = key % M。

•

当使⽤除法散列法时，要尽量避免M为某些值，如2的幂，10的幂等。如果是，那么key %本质相当于保留key的后X位，那么后x位相同的值，计算出的哈希值都是⼀样的，就冲突了。如：

{63 , 31}看起来没有关联的值，如果M是16，也就是，那么计算出的哈希值都是15，因为63的⼆进制后8位是 00111111，31的⼆进制后8位是 00011111。如果是，就更明显了，保留的都是10进值的后x位，如：{112, 12312}，如果M是100，也就是，那么计算出的哈希值都是12。

•

当使⽤除法散列法时，建议M取不太接近2的整数次幂的⼀个质数(素数)。

处理哈希冲突：

线性探测•

从发⽣冲突的位置开始，依次线性向后探测，直到寻找到下⼀个没有存储数据的位置为⽌，如果⾛

到哈希表尾，则回绕到哈希表头的位置。

•

h(key) =hash0 =key%M，hash0位置冲突了，则线性探测公式为：hc(key,i) =hashi= (hash0 +i) %M，i= {1, 2, 3, ...,M− 1}

因为负载因⼦⼩于1，则最多探测M-1次，⼀定能找到⼀个存储key的位置。

开放地址法实现哈希表：

namespace open_address { enum State { EXIST, EMPTY, DELETE }; template<class K, class V> struct HashData { pair<K, V> _kv; State _state = EMPTY; }; template<class K, class V, class Hash = HashFunc<K>> class HashTable { public: inline unsigned long __stl_next_prime(unsigned long n) { // Note: assumes long is at least 32 bits. static const int __stl_num_primes = 28; static const unsigned long __stl_prime_list[__stl_num_primes] = { 53, 97, 193, 389, 769, 1543, 3079, 6151, 12289, 24593, 49157, 98317, 196613, 393241, 786433, 1572869, 3145739, 6291469, 12582917, 25165843, 50331653, 100663319, 201326611, 402653189, 805306457, 1610612741, 3221225473, 4294967291 }; const unsigned long* first = __stl_prime_list; const unsigned long* last = __stl_prime_list + __stl_num_primes; const unsigned long* pos = lower_bound(first, last, n); return pos == last ? *(last - 1) : *pos; } HashTable() { _tables.resize(__stl_next_prime(0)); } bool Insert(const pair<K, V>& kv) { if (Find(kv.first)) return false; // 负载因⼦⼤于0.7就扩容 if (_n * 10 / _tables.size() >= 7) { // 这⾥利⽤类似深拷⻉现代写法的思想插⼊后交换解决 HashTable<K, V, Hash> newHT; newHT._tables.resize(__stl_next_prime(_tables.size()+1)); for (size_t i = 0; i < _tables.size(); i++) { if (_tables[i]._state == EXIST) { newHT.Insert(_tables[i]._kv); } } _tables.swap(newHT._tables); } Hash hash; size_t hash0 = hash(kv.first) % _tables.size(); size_t hashi = hash0; size_t i = 1; while (_tables[hashi]._state == EXIST) { // 线性探测 hashi = (hash0 + i) % _tables.size(); // ⼆次探测就变成 +- i^2 ++i; } _tables[hashi]._kv = kv; _tables[hashi]._state = EXIST; ++_n; return true; } HashData<K, V>* Find(const K& key) { Hash hash; size_t hash0 = hash(key) % _tables.size(); size_t hashi = hash0; size_t i = 1; while (_tables[hashi]._state != EMPTY) { if (_tables[hashi]._state == EXIST && _tables[hashi]._kv.first == key) { return &_tables[hashi]; } // 线性探测 hashi = (hash0 + i) % _tables.size(); ++i; } return nullptr; } bool Erase(const K& key) { HashData<K, V>* ret = Find(key); if (ret == nullptr) { return false; } else { ret->_state = DELETE; --_n; return true; } } private: vector<HashData<K, V>> _tables; size_t _n = 0; // 表中存储数据个数 }; }

链地址法：

解决冲突的思路：

开放定址法中所有的元素都放到哈希表⾥，链地址法中所有的数据不再直接存储在哈希表中，哈希表中存储⼀个指针，没有数据映射这个位置时，这个指针为空，有多个数据映射到这个位置时，我们把这些冲突的数据链接成⼀个链表，挂在哈希表这个位置下⾯，链地址法也叫做拉链法或者哈希桶。

链地址法代码实现：

namespace hash_bucket { template<class K, class V> struct HashNode { pair<K, V> _kv; HashNode<K, V>* _next; HashNode(const pair<K, V>& kv) :_kv(kv) ,_next(nullptr) {} }; template<class K, class V, class Hash = HashFunc<K>> class HashTable { typedef HashNode<K, V> Node; inline unsigned long __stl_next_prime(unsigned long n) { static const int __stl_num_primes = 28; static const unsigned long __stl_prime_list[__stl_num_primes] = { 53, 97, 193, 389, 769, 1543, 3079, 6151, 12289, 24593, 49157, 98317, 196613, 393241, 786433, 1572869, 3145739, 6291469, 12582917, 25165843, 50331653, 100663319, 201326611, 402653189, 805306457, 1610612741, 3221225473, 4294967291 }; const unsigned long* first = __stl_prime_list; const unsigned long* last = __stl_prime_list + __stl_num_primes; const unsigned long* pos = lower_bound(first, last, n); return pos == last ? *(last - 1) : *pos; } public: HashTable() { _tables.resize(__stl_next_prime(0), nullptr); } // 拷⻉构造和赋值拷⻉需要实现深拷⻉，有兴趣的同学可以⾃⾏实现 ~HashTable() { // 依次把每个桶释放 for (size_t i = 0; i < _tables.size(); i++) { Node* cur = _tables[i]; while (cur) { Node* next = cur->_next; delete cur; cur = next; } _tables[i] = nullptr; } } bool Insert(const pair<K, V>& kv) { Hash hs; size_t hashi = hs(kv.first) % _tables.size(); // 负载因⼦==1扩容 if (_n == _tables.size()) { /*HashTable<K, V> newHT; newHT._tables.resize(__stl_next_prime(_tables.size()+1); for (size_t i = 0; i < _tables.size(); i++) { Node* cur = _tables[i]; while(cur) { newHT.Insert(cur->_kv); cur = cur->_next; } } _tables.swap(newHT._tables);*/ // 这⾥如果使⽤上⾯的⽅法，扩容时创建新的结点，后⾯还要使⽤就结 点，浪费了 // 下⾯的⽅法，直接移动旧表的结点到新表，效率更好 vector<Node*> newtables(__stl_next_prime(_tables.size()+1), nullptr); for (size_t i = 0; i < _tables.size(); i++) { Node* cur = _tables[i]; while (cur) { Node* next = cur->_next; // 旧表中节点，挪动新表重新映射的位置 size_t hashi = hs(cur->_kv.first) % newtables.size(); // 头插到新表 cur->_next = newtables[hashi]; newtables[hashi] = cur; cur = next; } _tables[i] = nullptr; } _tables.swap(newtables); } // 头插 Node* newnode = new Node(kv); newnode->_next = _tables[hashi]; _tables[hashi] = newnode; ++_n; return true; } Node* Find(const K& key) { Hash hs; size_t hashi = hs(key) % _tables.size(); Node* cur = _tables[hashi]; while (cur) { if (cur->_kv.first == key) { return cur; } cur = cur->_next; } return nullptr; } bool Erase(const K& key) { Hash hs; size_t hashi = hs(key) % _tables .size(); Node* prev = nullptr; Node* cur = _tables[hashi]; while (cur) { if (cur->_kv.first == key) { if (prev == nullptr) { _tables[hashi] = cur->_next; } else { prev->_next = cur->_next; } delete cur; --_n; return true; } prev = cur; cur = cur->_next; } return false; } private: vector<Node*> _tables; // 指针数组 size_t _n = 0; // 表中存储数据个数 }; }

⽤哈希表封装myunordered_map和 myunordered_set

模拟实现unordered_map和unordered_set

#pragma once #include"Hashtable.h" // MyUnorderedMap.h namespace map { template<class K, class V, class Hash = HashFunc<K>> class unordered_map { struct MapKeyOfT { const K& operator()(const pair<K, V>& kv) { return kv.first; } }; public: typedef typename hash_bucket::HashTable<K, pair<const K, V>, MapKeyOfT, Hash>::Iterator iterator; typedef typename hash_bucket::HashTable<K, pair<const K, V>, MapKeyOfT, Hash>::ConstIterator const_iterator; iterator begin() { return _ht.Begin(); } iterator end() { return _ht.End(); } const_iterator begin() const { return _ht.Begin(); } const_iterator end() const { return _ht.End(); } V& operator[](const K& key) { pair<iterator, bool> ret = insert({ key, V() }); return ret.first->second; } pair<iterator, bool> insert(const pair<K, V>& kv) { return _ht.Insert(kv); } iterator Find(const K& key) { return _ht.Find(key); } bool Erase(const K& key) { return _ht.Erase(key); } private: hash_bucket::HashTable<K, pair<const K, V>, MapKeyOfT, Hash> _ht; }; void test_map1() { unordered_map<string, string> dict; dict.insert({ "sort", "排序" }); dict.insert({ "字符串", "string" }); dict.insert({ "sort", "排序" }); dict.insert({ "left", "左边" }); dict.insert({ "right", "右边" }); dict["left"] = "左边，剩余"; dict["insert"] = "插入"; dict["string"]; for (auto& kv : dict) { cout << kv.first << ":" << kv.second << endl; } cout << endl; unordered_map<string, string>::iterator it = dict.begin(); while (it != dict.end()) { // 不能修改first，可以修改second //it->first += 'x'; it->second += 'x'; cout << it->first << ":" << it->second << endl; ++it; } cout << endl; } }

#pragma once #include"Hashtable.h" namespace set { template<class K, class Hash = HashFunc<K>> class unordered_set { struct SetKeyOfT { const K& operator()(const K& key) { return key; } }; public: typedef typename hash_bucket::HashTable<K, const K, SetKeyOfT, Hash>::Iterator iterator; typedef typename hash_bucket::HashTable<K, const K, SetKeyOfT, Hash>::ConstIterator const_iterator; iterator begin() { return _ht.Begin(); } iterator end() { return _ht.End(); } const_iterator begin() const { return _ht.Begin(); } const_iterator end() const { return _ht.End(); } pair<iterator, bool> insert(const K& key) { return _ht.Insert(key); } iterator Find(const K& key) { return _ht.Find(key); } bool Erase(const K& key) { return _ht.Erase(key); } private: hash_bucket::HashTable<K, const K, SetKeyOfT, Hash> _ht; }; void print(const unordered_set<int>& s) { unordered_set<int>::const_iterator it = s.begin(); while (it != s.end()) { //*it = 1; cout << *it << " "; ++it; } cout << endl; for (auto e : s) { cout << e << " "; } cout << endl; } void test_set1() { int a[] = { 3,11,86,7,88,82,1,881,5,6,7,6 }; unordered_set<int> s; for (auto e : a) { s.insert(e); } unordered_set<int>::iterator it = s.begin(); while (it != s.end()) { //*it = 1; cout << *it << " "; ++it; } cout << endl; for (auto e : s) { cout << e << " "; } cout << endl; print(s); } }

⽀持iterator的实现

#pragma once #include <vector> #include <iostream> #include <string> using namespace std; // 请完成哈希表的如下操作 // 哈希函数采用除留余数法 //template<class K> //struct HashFunc //{ // size_t operator()(const K& key) // { // return (size_t)key; // } //}; // //// 哈希表中支持字符串的操作 //template<> //struct HashFunc<string> //{ // size_t operator()(const string& key) // { // size_t hash = 0; // for (auto e : key) // { // hash *= 31; // hash += e; // } // // return hash; // } //}; // 以下采用开放定址法，即线性探测解决冲突 enum State { EXIST, EMPTY, DELETE }; template<class K, class V> struct HashData { pair<K, V> _kv; State _state = EMPTY; }; template<class K> struct HashFunc { size_t operator()(const K& key) { return (size_t)key; } }; template<> struct HashFunc<string> { size_t operator()(const string& s) { // 补充：常用的BKDR字符串哈希算法（经典且高效） size_t hash = 0; for (char ch : s) { hash = hash + ch; hash = hash * 131;// 131是经验质数，也可用31/13131等 } return hash; } }; inline unsigned long _stl_next_prime(unsigned long n) { // Note: assumes long is at least 32 bits. static const int __stl_num_primes = 28; static const unsigned long __stl_prime_list[__stl_num_primes] = { 53, 97, 193, 389, 769, 1543, 3079, 6151, 12289, 24593, 49157, 98317, 196613, 393241, 786433, 1572869, 3145739, 6291469, 12582917, 25165843, 50331653, 100663319, 201326611, 402653189, 805306457, 1610612741, 3221225473, 4294967291 }; const unsigned long* first = __stl_prime_list; const unsigned long* last = __stl_prime_list + __stl_num_primes; const unsigned long* pos = lower_bound(first, last, n); return pos == last ? *(last - 1) : *pos; } namespace open_address { template<class K, class V, class Hash = HashFunc<K>> class HashTable { public: HashTable() :_tables(_stl_next_prime(0)) , _n(0) {} bool Insert(const pair<K, V>& kv) { if (Find(kv.first)) return false; // 负载因子 >= 0.7扩容 if (_n * 10 / _tables.size() >= 7) { //vector<HashData<K, V>> newtables(_tables.size()*2); //for (auto& data : _tables) //{ // // 旧表的数据映射到新表 // if (data._state == EXIST) // { // size_t hash0 = data._kv.first % newtables.size(); // // ... // } //} //_tables.swap(newtables); HashTable<K, V, Hash> newht; //newht._tables.resize(_tables.size() * 2); newht._tables.resize(__stl_next_prime(_tables.size() + 1)); for (auto& data : _tables) { // 旧表的数据映射到新表 if (data._state == EXIST) { newht.Insert(data._kv); } } _tables.swap(newht._tables); } Hash hash; size_t hash0 = hash(kv.first) % _tables.size(); size_t hashi = hash0; size_t i = 1; int flag = 1; while (_tables[hashi]._state == EXIST) { // 线性探测 hashi = (hash0 + i) % _tables.size(); ++i; /*hashi = (hash0 + (i*i*flag)) % _tables.size(); if (hashi < _tables.size()) hashi += _tables.size(); if (flag == 1) { flag = -1; } else { ++i; flag = 1; }*/ } _tables[hashi]._kv = kv; _tables[hashi]._state = EXIST; ++_n; return true; } HashData<K, V>* Find(const K& key) { Hash hash; size_t hash0 = hash(key) % _tables.size(); size_t hashi = hash0; size_t i = 1; while (_tables[hashi]._state != EMPTY) { if (_tables[hashi]._state == EXIST && _tables[hashi]._kv.first == key) { return &_tables[hashi]; } // 线性探测 hashi = (hash0 + i) % _tables.size(); ++i; } return nullptr; } bool Erase(const K& key) { HashData<K, V>* ret = Find(key); if (ret) { ret->_state = DELETE; return true; } else { return false; } } private: vector<HashData<K, V>> _tables; size_t _n; // 记录数据个数 }; } namespace hash_bucket { template<class T> struct HashNode { T _data; HashNode<T>* _next; HashNode(const T& data) :_data(data) , _next(nullptr) {} }; template<class K, class T, class KeyOfT, class Hash> class HashTable; template<class K, class T, class Ref, class Ptr, class KeyOfT, class Hash> struct HTIterator { typedef HashNode<T> Node; typedef HashTable<K, T, KeyOfT, Hash> HT; typedef HTIterator<K, T, Ref, Ptr, KeyOfT, Hash> Self; Node* _node; const HT* _ht; HTIterator(Node* node, const HT* ht) :_node(node) , _ht(ht) {} Ref operator*() { return _node->_data; } Ptr operator->() { return &_node->_data; } bool operator!=(const Self& s) { return _node != s._node; } Self& operator++() { if (_node->_next) { _node = _node->_next; } else { KeyOfT kot; Hash hash; size_t hashi = hash(kot(_node->_data)) % _ht->_tables.size(); ++hashi; while (hashi < _ht->_tables.size()) { _node = _ht->_tables[hashi]; if (_node) break; else ++hashi; } // 所有桶都走完了，end()给的空标识的_node if (hashi == _ht->_tables.size()) { _node = nullptr; } } return *this; } }; // K 为 T 中key的类型 // T 可能是键值对，也可能是K // KeyOfT: 从T中提取key // Hash将key转化为整形，因为哈市函数使用除留余数法 template<class K, class T, class KeyOfT, class Hash> class HashTable { template<class K, class T, class Ref, class Ptr, class KeyOfT, class Hash> friend struct HTIterator; typedef HashNode<T> Node; public: typedef HTIterator<K, T, T&, T*, KeyOfT, Hash> Iterator; typedef HTIterator<K, T, const T&, const T*, KeyOfT, Hash> ConstIterator; HashTable() :_tables(_stl_next_prime(0)) , _n(0) {} Iterator Begin() { if (_n == 0) return End(); for (size_t i = 0; i < _tables.size(); i++) { Node* cur = _tables[i]; if (cur) { return Iterator(cur, this); } } return End(); } Iterator End() { return Iterator(nullptr, this); } ConstIterator Begin() const { if (_n == 0) return End(); for (size_t i = 0; i < _tables.size(); i++) { Node* cur = _tables[i]; if (cur) { return ConstIterator(cur, this); } } return End(); } ConstIterator End() const { return ConstIterator(nullptr, this); } // 哈希桶的销毁 ~HashTable() { for (size_t i = 0; i < _tables.size(); i++) { Node* cur = _tables[i]; while (cur) { Node* next = cur->_next; delete cur; cur = next; } _tables[i] = nullptr; } } // 插入值为data的元素，如果data存在则不插入 pair<Iterator, bool> Insert(const T& data) { KeyOfT kot; Iterator it = Find(kot(data)); if (it != End()) return { it, false }; Hash hash; if (_n == _tables.size()) { vector<Node*> newtable(_stl_next_prime(_tables.size() + 1)); for (size_t i = 0; i < _tables.size(); i++) { Node* cur = _tables[i]; while (cur) { Node* next = cur->_next; size_t hashi = hash(kot(cur->_data)) % newtable.size(); cur->_next = newtable[hashi]; newtable[hashi] = cur; cur = next; } } _tables.swap(newtable); } size_t hashi = hash(kot(data)) % _tables.size(); // 头插 Node* newnode = new Node(data); newnode->_next = _tables[hashi]; _tables[hashi] = newnode; ++_n; return { Iterator(newnode, this), false }; } // 在哈希桶中查找值为key的元素，存在返回true否则返回false? Iterator Find(const K& key) { KeyOfT kot; Hash hash; size_t hashi = hash(key) % _tables.size(); Node* cur = _tables[hashi]; while (cur) { if (kot(cur->_data) == key) { return Iterator(cur, this); } cur = cur->_next; } return End(); } // 哈希桶中删除key的元素，删除成功返回true，否则返回false bool Erase(const K& key) { KeyOfT kot; size_t hashi = key % _tables.size(); Node* prev = nullptr; Node* cur = _tables[hashi]; while (cur) { if (kot(cur->_data) == key) { if (prev == nullptr) { // 头结点 _tables[hashi] = cur->_next; } else { // 中间节点 prev->_next = cur->_next; } delete cur; --_n; return true; } else { prev = cur; cur = cur->_next; } } return false; } private: vector<Node*> _tables; // 指针数组 size_t _n = 0; // 表中存储数据个数 }; }