Hashing

I present here a special implementation of a hash table algorithm. I have found this one to be particularly good since it limits the worst case search behaviour and allows the hash table to fill to nearly 100% with no performance penalty.

It employs a "random walk" algorithm when inserting data that shuffles the entries in the hash table around so that there is only a small number of probes required to find any given entry.

It also has the ability to resize itself when required.

Entries that cannot fit into the hash table because of too many collisions are stored in a linear array. When the linear array gets too full and search performance begins to drop off, the hash table is made larger to accommodate more data.

This code has been compiled and tested with MS VC++ 6.0 and DJGPP 3.04.

You can download the code from http://www.cjkware.com/wamckee/hashing.zip

#include <cstdlib> // for NULL // assumes class T has the following member functions defined: // // T (T & item); // copy constructor // bool operator == (T & rhs); // test for equality (exact match) // int first_hash_key (); // must return a value between 0 and INT_MAX // int next_hash_key (); // must return a value between 0 and INT_MAX // template <class T> class HashTable { private : // number of probes enum { N = 6 }; // actual value may vary ... the smaller the value, // the faster the average search time // if the value is too small, insertion time // goes way up and the maximum fill % goes way down // (N must be >= 2 or the algorithm never converges) enum { NUMBER_OF_TRIES = 100 }; // the bigger the number the higher the maximum % fill but // slower for worst case insertion time // (actual number of times may vary depending on what you // want the worst case insertion time to be) // (the bigger N is the small the number of times you need // to try and visa versa) // (could get bigger as hash_table_size increases) T * * hash_table; int hash_table_size; T * * overflow; int overflow_size; public : HashTable<T> (int starting_size) { if (starting_size < 100) // sanity check starting_size = 100; hash_table = NULL; hash_table_size = 0; overflow = NULL; overflow_size = 0; resize (starting_size); } ~HashTable<T> () { int i; for (i = 0; i < hash_table_size; i++) if (hash_table [i] != NULL) delete hash_table [i]; for (i = 0; i < overflow_size; i++) delete overflow [i]; delete [] hash_table; delete [] overflow; } T * find (T & item); // O(1) average search time T * insert (T & item); // O(1) average insert time void resize (int new_hash_table_size); // O(n) resize time (rarely called) private : void test_for_resize () { if (overflow_size >= 10) // some suitable value to limit // the worst case linear search time resize (hash_table_size * 2); // logrithmic allocation } public : // helper member functions for computing statistics // 100.0 * count () / size () = % full of the hash table // over () = number of entries in linear search int count () { int cnt = 0; for (int i = 0; i < hash_table_size; i++) if (hash_table [i] != NULL) cnt ++; return cnt; } int size () { return hash_table_size; } int over () { return overflow_size; } }; template <class T> T * HashTable<T>::find (T & item) { int i; int probe = item.first_hash_key () % hash_table_size; // if you haven't found it after N probes, you're never going // to find it in the hash table for (i = 0; i < N; i++) { if (hash_table [probe] != NULL && * hash_table [probe] == item) return hash_table [probe]; probe = item.next_hash_key () % hash_table_size; } // check the overflow array for the item (linear search) for (i = 0; i < overflow_size; i++) { if (* overflow [i] == item) return overflow [i]; } return NULL; } // returns a pointer to a copy of item after insertion template <class T> T * HashTable<T>::insert (T & item) { // search for an existing entry, if found, don't insert anything T * first; if ((first = find (item)) != NULL) return first; // make a new entry ready for insertion into the hash table // note: fishy allocation ... lots of little items may cause memory // fragmentation -- favours sparse hash tables // if your planning to fill the hash table, copy the item into // an array of T instead of allocating them one at a time first = new T (item); T * next = first; // try inserting NUMBER_OF_TRIES times for (int limit = 0; limit < NUMBER_OF_TRIES; limit++) { int probe [N]; int i; // calculate the hash probes (entry locations) for // the new item probe [0] = next -> first_hash_key () % hash_table_size; for (i = 1; i < N; i++) probe [i] = next -> next_hash_key () % hash_table_size; // search for an empty entry in the hash table for (i = 0; i < N; i++) { if (hash_table [probe [i]] == NULL) { // if you find an empty entry in the hash table, use it and quit hash_table [probe [i]] = next; return first; } } // all entries (probes) for this item are full therefore ... // pick an random entry i = rand () % N; // swap the random entry with the current item thus saving // the current item and getting a new one T * temp = hash_table [probe [i]]; hash_table [probe [i]] = next; next = temp; // and try again (trying to insert a new entry this time) } // create an array of items for linear searching if we fail // to insert into the hash table // note: lousy allocation strategy ... leads to memory fragmentation // but ideally overflow is not used that much so not big whooptie-do T * * new_overflow = new T * [overflow_size + 1]; for (int i = 0; i < overflow_size; i++) new_overflow [i] = overflow [i]; new_overflow [overflow_size] = next; delete [] overflow; overflow = new_overflow; overflow_size ++; test_for_resize (); return first; } // resize the hash table when overflow_size gets so large // that the searching performance drops below a certain threshold // (ideally overflow_size should be zero all the time) template <class T> void HashTable<T>::resize (int new_hash_table_size) { int i; int old_hash_table_size = hash_table_size; T * * old_hash_table = hash_table; int old_overflow_size = overflow_size; T * * old_overflow = overflow; hash_table_size = new_hash_table_size; hash_table = new T * [hash_table_size]; for (i = 0; i < hash_table_size; i++) hash_table [i] = NULL; // all entries gets set to NULL overflow = NULL; overflow_size = 0; for (i = 0; i < old_hash_table_size; i++) { if (old_hash_table [i] != NULL) { insert (* old_hash_table [i]); delete old_hash_table [i]; } } for (i = 0; i < old_overflow_size; i++) { insert (* old_overflow [i]); delete old_overflow [i]; } delete [] old_hash_table; delete [] old_overflow; } // sample program // simple data type ... uses an integer as a key class simple { public : int key; simple (int v) { key = v; } private : friend class HashTable<simple>; // required member functions simple (simple & item) { key = item.key; } bool operator == (simple & rhs) { return key == rhs.key; } int next; int first_hash_key () { next = key; return (next * 101) & 0x7fffffff; } int next_hash_key () { next ++; return (next * 103) & 0x7fffffff; } }; #include <iostream> int main () { HashTable<simple> ht (256); // insert a know value simple x (10); ht.insert (x); std::cout << "insert: " << x.key << std::endl; // insert 5000 random values for (int i = 0; i < 5000; i++) { simple s (rand ()); ht.insert (s); std::cout << s.key << " " << 100.0 * ht.count () / ht.size () << " " << ht.over () << std::endl; } // find the know value simple * s = ht.find (x); std::cout << "found: " << s->key << std::endl; return 0; }

Hash House Harriers	hash	hash function	Linear Hashing
hash table	symbiotic functions dancing in tandem	Euler Phi function	bitcoin
Kerckhoffs' Requirements	random walk	hares and hounds	BIOS password
static hashing	MD5 hash function	Indice	continuous
Mac	Lords of Acid	performance	brewing
Allessandra, the blonde from Wyoming	rainbow tables	March 23, 2006	John Adams's 1798 State of the Union Address

Recommended Reading

About Everything2

User Picks

Editor Picks

New Writeups