Lookups on known set of integer keys
Gperf consistently underperforms a Judy array in my environment, and I'm wondering if there is another perfect hash library out there built specifically for integer keys. I know the set of keys beforehand, and I would like to leverage that into a performance/size advantage.
There are ~1000 keys, and retrievals are not in sequential order. Key pairs are both integers. Keys are 32-bit, and retrieved values are 8-bit. Size is the most important factor.
If there is a way to tweak Gperf for integer keys, or just another approach in general, I'm all ears, too. :)
(Sidenote: ...While typing out this question, I realized a binary search probably takes the cake and I've just over-thought the problem. I'd still like to hear any thoughts you may have for the sake of learning, though!)
Edit: Keys are not evenly distributed. Most are randomly clustered throughout the entire possible range.
Edit 2: Worst-case binary searches were too slow for my taste, so I ended up playing with the keys until I found 8 bits to use from each to make 256 well-distributed buckets. I held the min & max of each bucket (24 bits for each bucket entry), and made one big struct array for the key-pairs. On-par/faster and smaller than everything else I tested with for my particular case, so I guess I'm going with that for now. :)
Solution 1:
Keep your keys sorted, and use an M-tree to retrieve any key.
An M-tree has M entry per node, instead of 2 for binaries. It will help tremendously for performance. Use a cache line size as the basis of your node size, hence 64 Bytes. You can store 16 32bits values in this size.
Since you have 1000 values, 3 levels will be enough to retrieve the right key (as opposed to 10 levels for a binary tree).
Another idea would be to hash your keys into a small hash-table, such as 12-bits one (4K entries), and solve the potential collisions with a simple chain. You are very likely to get most of your keys in a single search.
Solution 2:
/*
** Proof of concept for constructing a {fixed-size,lookup-only} hashtable
** needing only (2*N* sizeof(int)) additional storage for storing N num=value pairs.
** The key is supposed to be an int,
** the 'value' is a char.
** Note: some people would like to include <stdint.h> and replace all the ints by {uint32_t,uint16_t,uint8_t}.
**
** (c)opyright Wildplasser, 2011-11-12
** href = http://stackoverflow.com/questions/8059591/lookups-on-known-set-of-integer-keys
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
struct tinyhash {
unsigned key;
unsigned short link;
unsigned char value;
unsigned is_linked :1;
};
#define LINK_DEAD ((unsigned short)-1)
/* A hashtable with N slots for N entries (100% full) */
#define THE_SIZE 1000
struct tinyhash table[THE_SIZE] ;
void tiny_fill(void);
void tiny_init(void);
int tiny_find(unsigned key);
void tiny_print(void);
void tiny_count(void);
static int tiny_cmp( const void *l, const void *r);
static unsigned short * tiny_hnd(unsigned key);
static unsigned tiny_hash(unsigned key);
int main (void)
{
assert(sizeof table == 2 * THE_SIZE * sizeof(int));
fprintf (stderr, "Size=%u\n", (unsigned int) sizeof table);
tiny_fill();
tiny_init();
tiny_print();
tiny_count();
return 0;
}
/* Perform a table lookup.
** Return the "value" that belongs to "key"..
** If not found -1 is returned.
*/
int tiny_find(unsigned key)
{
unsigned short *ptr;
ptr = tiny_hnd(key);
return *ptr==LINK_DEAD ? -1 : table[*ptr].value ;
}
/* Find the place where key is located, or
** (if not found) where it should be appendend.
** The returned value is a pointer to the parent's .link field.
*/
static unsigned short * tiny_hnd(unsigned key)
{
unsigned hash;
unsigned short slot, *ptr;
hash = tiny_hash(key);
slot = hash % THE_SIZE;
for (ptr = &table[slot].link; *ptr != LINK_DEAD; ptr = &table[*ptr].link ) {
if ( !table[*ptr].is_linked ) break;
if ( table[*ptr].key == key) break;
}
return ptr;
}
/* For testing: fill hashtable with garbage */
void tiny_fill(void)
{
unsigned idx;
for (idx=0; idx < THE_SIZE; idx++ ) {
table[idx].key = rand() + 543 * rand();
table[idx].value = rand() ;
table[idx].link = LINK_DEAD;
table[idx].is_linked = 0;
}
}
/* Build hashtable, that is:
** shuffle the entries and build linked list.
*/
void tiny_init(void)
{
unsigned idx;
/* Phase 0: set all to unused.
** The link field is temporaly abused to store the intended
** slotnumber.
*/
for (idx=0; idx < THE_SIZE; idx++ ) {
table[idx].link = tiny_hash(table[idx].key) % THE_SIZE;
table[idx].is_linked = 0;
}
/* Phase 0a: sort on (intended) slotnumber. */
qsort (table, THE_SIZE, sizeof table[0] , tiny_cmp);
/* Phase 1: put enties at their intended slotnumber
** but only if the entry that lives there does not belong there
** (is uninitialized).
*/
for (idx=0; idx < THE_SIZE; idx++) {
unsigned slot;
/* [idx] has allready been placed */
if (table[idx].is_linked) continue;
slot = table[idx].link;
/* [idx] belongs here. freeze it */
if (slot==idx) {
table[idx].link = LINK_DEAD;
table[idx].is_linked = 1;
}
/* try to swap [idx] with the item at its intended slot */
else {
struct tinyhash tmp;
/* item[slot] belongs there: keep off */
if (table[slot].is_linked) continue;
tmp = table[idx];
table[idx] = table[slot];
table[slot] = tmp;
table[slot].is_linked = 1;
table[slot].link = LINK_DEAD;
/* Don't bump idx: [idx] and [slot] have been swapped,
** we need to inspect the new item[idx] at the next cycle.
*/
idx--; /* idx will be re-incremented in the loop; */
}
}
/* Phase 2: link any remaining unplaced item to its
** parent in the LL-chain.
*/
for (idx=0; idx < THE_SIZE; idx++ ) {
unsigned short *parent;
if (table[idx].is_linked) continue;
parent = tiny_hnd(table[idx].key);
if (*parent != LINK_DEAD) continue; /* duplicate */
*parent = idx;
table[idx].is_linked = 1;
table[idx].link = LINK_DEAD;
}
}
/* Compare function for qsort()
*/
static int tiny_cmp( const void *vl, const void *vr)
{
struct tinyhash *l = (struct tinyhash *)vl;
struct tinyhash *r = (struct tinyhash *)vr;
#if 0
unsigned slot_l, slot_r;
slot_l = tiny_hash(l->key) %THE_SIZE;
slot_r = tiny_hash(r->key) %THE_SIZE;
if (slot_l < slot_r ) return -3;
if (slot_l > slot_r ) return 3;
#else
if (l->link < r->link ) return -3;
if (l->link > r->link ) return 3;
#endif
if (l->key < r->key) return -2;
if (l->key > r->key) return 2;
if (l < r) return -1;
if (l > r) return 1;
return 0;
}
/* Stupid hashfunction, to be replaced with something usefull..
** (works good for random ints) Use at your own risk.
*/
static unsigned tiny_hash(unsigned key)
{
return key * 98765;
}
void tiny_print(void)
{
unsigned idx;
for (idx=0; idx < THE_SIZE; idx++ ) {
unsigned slot;
int dir;
slot = tiny_hash(table[idx].key) % THE_SIZE;
dir = (slot==idx) ? '=' : (slot>idx) ? '<': '>';
fprintf(stderr, "[%4x] %c %4x: %4x %c %10u->%3u\n"
, idx, dir, 0xffff & slot
, 0xffff & table[idx].link
, table[idx].is_linked ? '*' : '.'
, table[idx].key,table[idx].value
);
}
}
/* For testing: print the hashchains, construct a histogram of chainlengths,
** and calculate the "total cost of retrieval".
*/
void tiny_count(void)
{
unsigned idx, cnt, len, tothops, slot;
unsigned histogram[THE_SIZE];
memset(histogram, 0, sizeof histogram);
cnt=tothops=0;
for (slot =0; slot < THE_SIZE; slot++ ) {
idx = tiny_hash(table[slot].key) % THE_SIZE;
if (slot!=idx) continue; /* this is not the start of a chain */
for (len=0 ; idx != LINK_DEAD; idx = table[idx].link) {
if (!table[idx].is_linked) continue;
if (len==0) fprintf(stderr, "[%u]:", slot);
fprintf(stderr, " %u", table[idx].key);
len++;
}
fprintf(stderr, "(=%u)\n", len);
cnt += len;
histogram[len] += 1;
tothops += (((len+1) *len)/2);
}
fprintf(stderr, "Histogram of chainlengths:\n");
for (len=0; len < THE_SIZE; len++) {
if (!histogram[len]) continue;
fprintf(stderr, "[%u]: %u\n", len, histogram[len]);
}
fprintf(stderr, "tothops=%u/%u (%f hops per node)\n"
, tothops, cnt, (double)tothops/cnt);
}
The result: (well: some of it)
....
[978]: 1794172570(=1)
[980]: 642121828(=1)
[983]: 2674104091(=1)
[985]: 547527125(=1)
[986]: 2383911594(=1)
[988]: 4254837348(=1)
[989]: 1860851825 1990214465 1766290905(=3)
[990]: 3793608270 469685686(=2)
[992]: 1189958296 872917240(=2)
[994]: 1999781290 1501026482(=2)
[995]: 520334159 211600319(=2)
[997]: 177583921(=1)
[998]: 1173163646 1013572158(=2)
[999]: 1705614211 3460318251(=2)
Histogram of chainlengths:
[1]: 369
[2]: 190
[3]: 57
[4]: 15
[5]: 4
tothops=1491/1000 (1.491000 hops per node)
Note: because of the sorting while initialising the hashtable, entries are very close to the place where they are expected. This increases the locality of reference.