bloom.h
7.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
/*
* Copyright (C) 2014 Freie Universität Berlin
*
* This file is subject to the terms and conditions of the GNU Lesser
* General Public License v2.1. See the file LICENSE in the top level
* directory for more details.
*/
/*
* bloom.c
*
* Bloom filters
*
* HISTORY
* {x, y, z}
* A Bloom filter is a probibalistic : : :
* data structure with several interesting /|\ /|\ /|\
* properties, such as low memory usage, / | X | X | \
* asymmetric query confidence, and a very / |/ \|/ \| \
* speedy O(k) membership test. / | | \ \
* / /| /|\ |\ \
* Because a Bloom filter can . . . . . . . . .
* accept any input that can be 00000000001000101010101010100010000000000
* hashed effectively (such as " " "
* strings), that membership test \ | /
* tends to draw a crowd. TNSTAAFL, but \ | /
* as caveats go, the Bloom filters' are \ | /
* more interesting than incapacitating. \|/
* :
* Most notably, it can tell you with certainty {w}
* that an item 'i' is *not* a member of set 's',
* but it can only tell you with some finite
* probability whether an item 'i' *is* a member
* of set 's'.
*
* Still, along with the intriguing possibility of using bitwise AND and OR
* to compute the logical union and intersection of two filters, the cheap
* cost of adding elements to the filter set, and the low memory requirements,
* the Bloom filter is a good choice for many applications.
*
* NOTES
*
* Let's look more closely at the probability values.
*
* Assume that a hash function selects each array position with equal
* probability. If m is the number of bits in the array, and k is the number
* of hash functions, then the probability that a certain bit is not set
* to 1 by a certain hash function during the insertion of an element is
*
* 1-(1/m).
*
* The probability that it is not set to 1 by any of the hash functions is
*
* (1-(1/m))^k.
*
* If we have inserted n elements, the probability that a certain bit is
* set 0 is
*
* (1-(1/m))^kn,
*
* Meaning that the probability said bit is set to 1 is therefore
*
* 1-([1-(1/m)]^kn).
*
* Now test membership of an element that is not in the set. Each of the k
* array positions computed by the hash functions is 1 with a probability
* as above. The probability of all of them being 1, which would cause the
* algorithm to erroneously claim that the element is in the set, is often
* given as
*
* (1-[1-(1/m)]^kn)^k ~~ (1 - e^(-kn/m))^k.
*
* This is not strictly correct as it assumes independence for the
* probabilities of each bit being set. However, assuming it is a close
* approximation we have that the probability of false positives descreases
* as m (the number of bits in the array) increases, and increases as n
* (the number of inserted elements) increases. For a given m and n, the
* value of k (the number of hash functions) that minimizes the probability
* is
*
* (m/n)ln(2) ~~ 0.7(m/n),
*
* which gives the false positive probability of
*
* 2^-k ~~ 0.6185^(m/n).
*
* The required number of bits m, given n and a desired false positive
* probability p (and assuming the optimal value of k is used) can be
* computed by substituting the optimal value of k in the probability
* expression above:
*
* p = (1 - e^(-(((m/n)ln(2))*(n/m))))^((m/n)ln(2)),
*
* which simplifies to
*
* ln(p) = -(m/n) * (ln2)^2.
*
* This results in the equation
*
* m = -((n*ln(p)) / ((ln(2))^2))
*
* The classic filter uses
*
* 1.44*log2(1/eta)
*
* bits of space per inserted key, where eta is the false positive rate of
* the Bloom filter.
*
*/
/**
* @defgroup sys_bloom Bloom filter
* @ingroup sys
* @brief Bloom filter library
* @{
*
* @file
* @brief Bloom filter API
*
* @author Christian Mehlis <mehlis@inf.fu-berlin.de>
*/
#ifndef BLOOM_H
#define BLOOM_H
#include <stdlib.h>
#include <stdbool.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
/**
* @brief hash function to use in thee filter
*/
typedef uint32_t (*hashfp_t)(const uint8_t *, int len);
/**
* @brief bloom_t bloom filter object
*/
typedef struct {
/** number of bits in the bloom array */
size_t m;
/** number of hash functions */
size_t k;
/** the bloom array */
uint8_t *a;
/** the hash functions */
hashfp_t *hash;
} bloom_t;
/**
* @brief Initialize a Bloom Filter.
*
* @note For best results, make 'size' a power of 2.
*
* @param bloom bloom_t to initialize
* @param size size of the bloom filter in bits
* @param bitfield underlying bitfield of the bloom filter
* @param hashes array of hashes
* @param hashes_numof number of elements in hashes
*
* @pre @p bitfield MUST be large enough to hold @p size bits.
*/
void bloom_init(bloom_t *bloom, size_t size, uint8_t *bitfield, hashfp_t *hashes, int hashes_numof);
/**
* @brief Delete a Bloom filter.
*
* @param bloom The condemned
* @return nothing
*
*/
void bloom_del(bloom_t *bloom);
/**
* @brief Add a string to a Bloom filter.
*
* CAVEAT
* Once a string has been added to the filter, it cannot be "removed"!
*
* @param bloom Bloom filter
* @param buf string to add
* @param len the length of the string @p buf
* @return nothing
*
*/
void bloom_add(bloom_t *bloom, const uint8_t *buf, size_t len);
/**
* @brief Determine if a string is in the Bloom filter.
*
* The string 's' is hashed once for each of the 'k' hash functions, as
* though we were planning to add it to the filter. Instead of adding it
* however, we examine the bit that we *would* have set, and consider its
* value.
*
* If the bit is 1 (set), the string we are hashing may be in the filter,
* since it would have set this bit when it was originally hashed. However,
* it may also be that another string just happened to produce a hash value
* that would also set this bit. That would be a false positive. This is why
* we have k > 1, so we can minimize the likelihood of false positives
* occuring.
*
* If every bit corresponding to every one of the k hashes of our query
* string is set, we can say with some probability of being correct that
* the string we are holding is indeed "in" the filter. However, we can
* never be sure.
*
* If, however, as we hash our string and peek at the resulting bit in the
* filter, we find the bit is 0 (not set)... well now, that's different.
* In this case, we can say with absolute certainty that the string we are
* holding is *not* in the filter, because if it were, this bit would have
* to be set.
*
* In this way, the Bloom filter can answer NO with absolute surety, but
* can only speak a qualified YES.
*
* @param bloom Bloom filter
* @param buf string to check
* @param len the length of the string @p buf
*
*
* @return false if string does not exist in the filter
* @return true if string is may be in the filter
*
*/
bool bloom_check(bloom_t *bloom, const uint8_t *buf, size_t len);
#ifdef __cplusplus
}
#endif
/** @} */
#endif /* BLOOM_H */