Bloom filter
Table of Contents
Overview
A Bloom filter is a space-efficient probabilistic data structure that is used to test whether an element is a member of a set.[1]
Algorithm description
- An empty Bloom filter is a bit array of m bits, all set to 0. There must also be k different hash functions defined, each of which maps or hashes some set element to one of the m array positions with a uniform random distribution.
- To add an element, feed it to each of the k hash functions to get k array positions. Set the bits at all these positions to 1.
- To query for an element (test whether it is in the set), feed it to each of the k hash functions to get k array positions. If any of the bits at these positions are 0, the element is definitely not in the set – if it were, then all the bits would have been set to 1 when it was inserted. If all are 1, then either the element is in the set, or the bits have by chance been set to 1 during the insertion of other elements, resulting in a false positive.
Implementation in C
#ifndef BLOOM_H
#define BLOOM_H

#include <stdlib.h>

/*
 * Hash callback used by the filter: takes a NUL-terminated string and
 * returns an unsigned int. The filter reduces the result modulo its size.
 */
typedef unsigned int (*hashfunc_t)(const char *);

/* Bloom filter state. Create with bloom_create(), release with bloom_destroy(). */
typedef struct {
    size_t size;            /* number of bits in the filter */
    unsigned char *a;       /* bit array holding `size` bits */
    size_t nhashfuncs;      /* number of entries in `hashfuncs` */
    hashfunc_t *hashfuncs;  /* hash callbacks applied to every element */
} BLOOM;

/*
 * Allocate a filter of `size` bits, all cleared.
 *
 * size        number of bits; must be greater than zero, because hash values
 *             are reduced modulo `size`
 * nhashfuncs  number of hash callbacks that follow
 * ...         exactly `nhashfuncs` arguments of type hashfunc_t
 *
 * Returns the new filter, or NULL if it could not be created. The caller owns
 * the result and must release it with bloom_destroy().
 */
BLOOM *bloom_create(size_t size, size_t nhashfuncs, ...);

/* Free the filter and everything it owns. Always returns 0. */
int bloom_destroy(BLOOM *bloom);

/* Add the string `s` by setting one bit per hash callback. Always returns 0. */
int bloom_add(BLOOM *bloom, const char *s);

/*
 * Test the string `s` for membership.
 * Returns 0 if `s` was definitely never added, 1 if it may have been added
 * (false positives are possible).
 */
int bloom_check(BLOOM *bloom, const char *s);

#endif /* BLOOM_H */
#include <limits.h> #include <stdarg.h> #include "bloom.h" #define SETBIT(a, n) (a[n/CHAR_BIT] |= (1<<(n%CHAR_BIT))) #define GETBIT(a, n) (a[n/CHAR_BIT] & (1<<(n%CHAR_BIT))) BLOOM *bloom_create(size_t size, size_t nhashfuncs, ...) { BLOOM *bloom; va_list l; int n; if(!(bloom = malloc(sizeof(BLOOM)))) return NULL; if(!(bloom->a = calloc((size + CHAR_BIT - 1) / CHAR_BIT, sizeof(char)))){ free(bloom); return NULL; } if(!(bloom->hashfuncs = (hashfunc_t *) malloc(nhashfuncs * sizeof(hashfunc_t)))){ free(bloom->a); free(bloom); return NULL; } va_start(l, nhashfuncs); for(n = 0; n < nhashfuncs; ++n){ bloom->hashfuncs[n] = va_arg(l, hashfunc_t); } va_end(l); bloom->size = size; bloom->nhashfuncs = nhashfuncs; return bloom; } int bloom_destroy(BLOOM *bloom) { free(bloom->hashfuncs); free(bloom->a); free(bloom); return 0; } int bloom_add(BLOOM *bloom, const char *s) { size_t n; for(n = 0; n < bloom->nhashfuncs; ++n){ SETBIT(bloom->a, bloom->hashfuncs[n](s)%bloom->size); } return 0; } int bloom_check(BLOOM *bloom, const char *s) { size_t n; for(n = 0; n < bloom->nhashfuncs; ++n) if(!(GETBIT(bloom->a, bloom->hashfuncs[n](s)%bloom->size))) return 0; return 1; }
#include <stdio.h> #include <string.h> #include "bloom.h" unsigned int sax_hash(const char *key) { unsigned int h = 0; while(*key) h ^= (h << 5) + (h >> 2) + (unsigned char)*key++; return h; } unsigned int sdbm_hash(const char *key) { unsigned int h = 0; while(*key) h = (unsigned char)*key++ + (h << 6) + (h << 16) - h; return h; } int main(int argc, char *argv[]) { FILE *fp; char line[1024]; char *p; BLOOM *bloom; if(argc < 2){ fprintf(stderr, "ERROR, No word file specified\n"); return EXIT_FAILURE; } if(!(bloom = bloom_create(250000, 2, sax_hash, sdbm_hash))){ fprintf(stderr, "ERROR: Could not create bloom filter\n"); return EXIT_FAILURE; } if(!(fp = fopen(argv[1], "r"))){ fprintf(stderr, "ERROR: Could not open the file:%s\n", argv[1]); return EXIT_FAILURE; } while(fgets(line, 1024, fp)){ if((p = strchr(line, '\r'))) *p = '\0'; if((p = strchr(line, '\n'))) *p = '\0'; bloom_add(bloom, line); } fclose(fp); while(fgets(line, 1024, stdin)){ if((p = strchr(line, '\r'))) *p = '\0'; if((p = strchr(line, '\n'))) *p = '\0'; p=strtok(line, " \t,.;:\r\n?!-/()"); while(p){ if(!bloom_check(bloom, p)) printf("No match for word: %s\n", p); p = strtok(NULL," \t,.;:\r\n?!-/()"); } } bloom_destroy(bloom); return EXIT_SUCCESS; }
# Builds the Bloom filter demo. `make` produces the executable `bloom`.
# Override the compiler or flags on the command line, e.g. `make CC=clang`.
CC = gcc
CFLAGS = -Wall -pedantic -ansi

.PHONY: all clean

all: bloom

bloom: bloom.o test.o
	$(CC) -o bloom -Wall -pedantic bloom.o test.o

bloom.o: bloom.c bloom.h
	$(CC) -o bloom.o $(CFLAGS) -c bloom.c

test.o: test.c bloom.h
	$(CC) -o test.o $(CFLAGS) -c test.c

# Remove everything the build produces.
clean:
	rm -f bloom bloom.o test.o
Test files: wordlist and check
make
./bloom wordlist < check