Bloom filter

Table of Contents

Overview

A Bloom filter is a space-efficient probabilistic data structure used to test whether an element is a member of a set.[1] A query can return a false positive, but never a false negative.

Algorithm description

  1. An empty Bloom filter is a bit array of m bits, all set to 0. There must also be k different hash functions defined, each of which maps or hashes some set element to one of the m array positions with a uniform random distribution.
  2. To add an element, feed it to each of the k hash functions to get k array positions. Set the bits at all these positions to 1.
  3. To query for an element (test whether it is in the set), feed it to each of the k hash functions to get k array positions. If any of the bits at these positions are 0, the element is definitely not in the set – if it were, then all the bits would have been set to 1 when it was inserted. If all are 1, then either the element is in the set, or the bits have by chance been set to 1 during the insertion of other elements, resulting in a false positive.

Implementation in C

bloom.h

/* bloom.h - public interface of a simple Bloom filter. */
#ifndef BLOOM_H
#define BLOOM_H

/* Provides size_t; also gives includers the allocation functions and the
   EXIT_* macros. */
#include <stdlib.h>

/* Hash callback: maps a NUL-terminated string to an unsigned value. The
   filter reduces that value modulo its bit count to pick a bit position. */
typedef unsigned int (*hashfunc_t)(const char *);

/* State of one Bloom filter. */
typedef struct{
  size_t size;            /* number of bits in the filter */
  unsigned char *a;       /* bit array, CHAR_BIT bits per element */
  size_t nhashfuncs;      /* number of entries in hashfuncs */
  hashfunc_t *hashfuncs;  /* hash callbacks applied to every key */
} BLOOM;

/* Create a filter of `size` bits. Exactly `nhashfuncs` arguments of type
   hashfunc_t must follow. `size` should be non-zero, because bit positions
   are computed modulo `size`. Returns NULL if an allocation fails; release
   the result with bloom_destroy(). */
BLOOM *bloom_create(size_t size, size_t nhashfuncs, ...);

/* Free the filter together with its bit array and hash table. Returns 0. */
int bloom_destroy(BLOOM *bloom);

/* Record the string `s` by setting one bit per hash function. Returns 0. */
int bloom_add(BLOOM *bloom, const char *s);

/* Returns 0 if `s` was definitely never added, 1 if it may have been
   (false positives are possible). */
int bloom_check(BLOOM *bloom, const char *s);

#endif

bloom.c

#include <limits.h>
#include <stdarg.h>

#include "bloom.h"

/* Bit-array helpers: `a` is an array of unsigned char and `n` is a bit
   index. Every parameter is parenthesized so that compound expressions
   (for example `h % size` or `i + 8`) select the intended byte and bit.
   NOTE: `n` is still expanded twice, so avoid arguments with side effects
   or expensive calls. */
#define SETBIT(a, n) ((a)[(n)/CHAR_BIT] |= (1<<((n)%CHAR_BIT)))
#define GETBIT(a, n) ((a)[(n)/CHAR_BIT] & (1<<((n)%CHAR_BIT)))

BLOOM *bloom_create(size_t size, size_t nhashfuncs, ...)
{
  BLOOM *bloom;
  va_list l;
  int n;
  if(!(bloom = malloc(sizeof(BLOOM)))) return NULL;
  if(!(bloom->a = calloc((size + CHAR_BIT - 1) / CHAR_BIT, sizeof(char)))){
    free(bloom);
    return NULL;
  }
  if(!(bloom->hashfuncs = (hashfunc_t *) malloc(nhashfuncs * sizeof(hashfunc_t)))){
    free(bloom->a);
    free(bloom);
    return NULL;
  }

  va_start(l, nhashfuncs);
  for(n = 0; n < nhashfuncs; ++n){
    bloom->hashfuncs[n] = va_arg(l, hashfunc_t);
  }
  va_end(l);

  bloom->size = size;
  bloom->nhashfuncs = nhashfuncs;

  return bloom;
}

/*
 * Release a filter: its hash-function table, its bit array and the
 * structure itself. Passing NULL is a no-op, mirroring free().
 * Always returns 0.
 */
int bloom_destroy(BLOOM *bloom)
{
  if(!bloom) return 0;

  free(bloom->hashfuncs);
  free(bloom->a);
  free(bloom);
  return 0;
}

/*
 * Add the NUL-terminated string `s` to the filter by setting, for each
 * hash function, the bit at position hash(s) % size. Always returns 0.
 */
int bloom_add(BLOOM *bloom, const char *s)
{
  size_t n;
  size_t bit;
  for(n = 0; n < bloom->nhashfuncs; ++n){
    /* Compute the position once: SETBIT expands its index argument twice,
       which would otherwise run every hash function twice per key. */
    bit = bloom->hashfuncs[n](s) % bloom->size;
    SETBIT(bloom->a, bit);
  }
  return 0;
}

/*
 * Test whether the NUL-terminated string `s` may have been added.
 * Returns 0 as soon as one of its bit positions is clear (definitely not
 * added), or 1 if every position is set (possibly added).
 */
int bloom_check(BLOOM *bloom, const char *s)
{
  size_t n;
  size_t bit;
  for(n = 0; n < bloom->nhashfuncs; ++n){
    /* Compute the position once: GETBIT expands its index argument twice,
       which would otherwise run every hash function twice per key. */
    bit = bloom->hashfuncs[n](s) % bloom->size;
    if(!(GETBIT(bloom->a, bit)))
      return 0;
  }
  return 1;
}

test.c

#include <stdio.h>
#include <string.h>

#include "bloom.h"

/*
 * Shift-add-xor string hash. Each byte of `key`, read as unsigned char,
 * is mixed into the running value; the empty string hashes to 0.
 */
unsigned int sax_hash(const char *key)
{
  const unsigned char *cursor = (const unsigned char *)key;
  unsigned int hash = 0;

  for(; *cursor != '\0'; ++cursor){
    unsigned int mixed = (hash << 5) + (hash >> 2) + *cursor;
    hash ^= mixed;
  }
  return hash;
}

/*
 * sdbm string hash. Each byte of `key`, read as unsigned char, is folded
 * in as hash = hash * 65599 + byte; the empty string hashes to 0.
 */
unsigned int sdbm_hash(const char *key)
{
  const unsigned char *cursor = (const unsigned char *)key;
  unsigned int hash = 0;

  for(; *cursor != '\0'; ++cursor){
    /* 65599 == (1 << 6) + (1 << 16) - 1, i.e. the shift form in one
       multiplication; unsigned arithmetic wraps identically. */
    hash = hash * 65599u + *cursor;
  }
  return hash;
}

/*
 * Spell-check style demo.
 *
 * argv[1] names a word-list file; every line of it (with the line ending
 * stripped) is added to a Bloom filter. Standard input is then split into
 * words on whitespace and punctuation, and each word the filter reports
 * as absent is printed.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if the argument is missing, the
 * filter cannot be created, or the word list cannot be opened.
 */
int main(int argc, char *argv[])
{
  /* Characters that separate words on standard input. */
  static const char delims[] = " \t,.;:\r\n?!-/()";
  FILE *fp;
  char line[1024];
  char *p;
  BLOOM *bloom;

  if(argc < 2){
    fprintf(stderr, "ERROR, No word file specified\n");
    return EXIT_FAILURE;
  }

  if(!(bloom = bloom_create(250000, 2, sax_hash, sdbm_hash))){
    fprintf(stderr, "ERROR: Could not create bloom filter\n");
    return EXIT_FAILURE;
  }

  if(!(fp = fopen(argv[1], "r"))){
    fprintf(stderr, "ERROR: Could not open the file:%s\n", argv[1]);
    /* Do not leak the filter on this early exit. */
    bloom_destroy(bloom);
    return EXIT_FAILURE;
  }

  /* Load the dictionary: one word per line, line ending removed. */
  while(fgets(line, sizeof line, fp)){
    if((p = strchr(line, '\r'))) *p = '\0';
    if((p = strchr(line, '\n'))) *p = '\0';
    bloom_add(bloom, line);
  }

  fclose(fp);

  /* Check every word read from standard input against the filter. */
  while(fgets(line, sizeof line, stdin)){
    if((p = strchr(line, '\r'))) *p = '\0';
    if((p = strchr(line, '\n'))) *p = '\0';

    p = strtok(line, delims);
    while(p){
      if(!bloom_check(bloom, p))
        printf("No match for word: %s\n", p);
      p = strtok(NULL, delims);
    }
  }
  bloom_destroy(bloom);

  return EXIT_SUCCESS;
}

Makefile

# Builds the Bloom filter demo; `make` produces the executable ./bloom.
# `all` and `clean` name actions, not files.
.PHONY: all clean

all: bloom

# Link the filter implementation with the test driver.
bloom: bloom.o test.o
	gcc -o bloom -Wall -pedantic bloom.o test.o

# Both objects are compiled as strict ANSI C.
bloom.o: bloom.c bloom.h
	gcc -o bloom.o -Wall -pedantic -ansi -c bloom.c
test.o: test.c bloom.h
	gcc -o test.o -Wall -pedantic -ansi -c test.c

# Remove everything the build produces.
clean:
	rm -f bloom bloom.o test.o

Test files: wordlist and check

make
./bloom wordlist < check

Footnotes:

Author: Shi Shougang

Created: 2016-11-08 Tue 23:16

Emacs 24.3.1 (Org mode 8.2.10)

Validate