Wire Sysio Wire Sysion 1.0.0
Loading...
Searching...
No Matches
bloom_filter.hpp
Go to the documentation of this file.
1#pragma once
2
3/*
4 *********************************************************************
5 * *
6 * Open Bloom Filter *
7 * *
8 * Author: Arash Partow - 2000 *
9 * URL: http://www.partow.net *
10 * URL: http://www.partow.net/programming/hashfunctions/index.html *
11 * *
12 * Copyright notice: *
13 * Free use of the Open Bloom Filter Library is permitted under the *
14 * guidelines and in accordance with the most current version of the *
15 * Common Public License. *
16 * http://www.opensource.org/licenses/cpl1.0.php *
17 * *
18 *********************************************************************
19*/
20
21#include <algorithm>
22#include <cmath>
23#include <cstddef>
24#include <iterator>
25#include <limits>
26#include <string>
27#include <vector>
28
30
31namespace fc {
32
33
34static const std::size_t bits_per_char = 0x08; // 8 bits in 1 char(unsigned)
35static const unsigned char bit_mask[bits_per_char] = {
36 0x01, //00000001
37 0x02, //00000010
38 0x04, //00000100
39 0x08, //00001000
40 0x10, //00010000
41 0x20, //00100000
42 0x40, //01000000
43 0x80 //10000000
44 };
45
47{
48public:
49
51 : minimum_size(1),
52 maximum_size(std::numeric_limits<unsigned long long int>::max()),
54 maximum_number_of_hashes(std::numeric_limits<unsigned int>::max()),
57 random_seed(0xA5A5A5A55A5A5A5AULL)
58 {}
59
61 {}
62
63 inline bool operator!()
64 {
65 return (minimum_size > maximum_size) ||
71 (std::numeric_limits<double>::infinity() == std::abs(false_positive_probability)) ||
72 (0 == random_seed) ||
73 (0xFFFFFFFFFFFFFFFFULL == random_seed);
74 }
75
76 //Allowed min/max size of the bloom filter in bits
77 unsigned long long int minimum_size;
78 unsigned long long int maximum_size;
79
80 //Allowed min/max number of hash functions
83
84 //The approximate number of elements to be inserted
85 //into the bloom filter, should be within one order
86 //of magnitude. The default is 10000.
87 unsigned long long int projected_element_count;
88
89 //The approximate false positive probability expected
90 //from the bloom filter. The default is the reciprocal
91 //of the projected_element_count.
93
94 unsigned long long int random_seed;
95
97 {
102
103 unsigned int number_of_hashes;
104 unsigned long long int table_size;
105 };
106
108
110 {
111 /*
112 Note:
113 The following will attempt to find the number of hash functions
114 and minimum amount of storage bits required to construct a bloom
115 filter consistent with the user defined false positive probability
116 and estimated element insertion count.
117 */
118
119 if (!(*this))
120 return false;
121
122 double min_m = std::numeric_limits<double>::infinity();
123 double min_k = 0.0;
124 double curr_m = 0.0;
125 double k = 1.0;
126
127 while (k < 1000.0)
128 {
129 double numerator = (- k * projected_element_count);
130 double denominator = std::log(1.0 - std::pow(false_positive_probability, 1.0 / k));
131 curr_m = numerator / denominator;
132 if (curr_m < min_m)
133 {
134 min_m = curr_m;
135 min_k = k;
136 }
137 k += 1.0;
138 }
139
141
142 optp.number_of_hashes = static_cast<unsigned int>(min_k);
143 optp.table_size = static_cast<unsigned long long int>(min_m);
144 optp.table_size += (((optp.table_size % bits_per_char) != 0) ? (bits_per_char - (optp.table_size % bits_per_char)) : 0);
145
150
151 if (optp.table_size < minimum_size)
153 else if (optp.table_size > maximum_size)
155
156 return true;
157 }
158
159};
160
162{
163protected:
164
165 typedef unsigned int bloom_type;
166 typedef unsigned char cell_type;
167
168public:
169
179
181 : projected_element_count_(p.projected_element_count),
183 random_seed_((p.random_seed * 0xA5A5A5A5) + 1),
184 desired_false_positive_probability_(p.false_positive_probability)
185 {
186 salt_count_ = p.optimal_parameters.number_of_hashes;
187 table_size_ = p.optimal_parameters.table_size;
189 raw_table_size_ = table_size_ / bits_per_char;
190
191 bit_table_.resize( static_cast<std::size_t>(raw_table_size_) );
192 //bit_table_ = new cell_type[static_cast<std::size_t>(raw_table_size_)];
193 std::fill_n(bit_table_.data(),raw_table_size_,0x00);
194 }
195
197 {
198 this->operator=(filter);
199 }
200
201 inline bool operator == (const bloom_filter& f) const
202 {
203 if (this != &f)
204 {
205 return
206 (salt_count_ == f.salt_count_) &&
207 (table_size_ == f.table_size_) &&
208 (raw_table_size_ == f.raw_table_size_) &&
209 (projected_element_count_ == f.projected_element_count_) &&
210 (inserted_element_count_ == f.inserted_element_count_) &&
211 (random_seed_ == f.random_seed_) &&
212 (desired_false_positive_probability_ == f.desired_false_positive_probability_) &&
213 (salt_ == f.salt_) &&
214 std::equal(f.bit_table_.data(),f.bit_table_.data() + raw_table_size_,bit_table_.data());
215 }
216 else
217 return true;
218 }
219
220 inline bool operator != (const bloom_filter& f) const
221 {
222 return !operator==(f);
223 }
224
226 {
227 if (this != &f)
228 {
229 salt_count_ = f.salt_count_;
230 table_size_ = f.table_size_;
231 raw_table_size_ = f.raw_table_size_;
232 projected_element_count_ = f.projected_element_count_;
233 inserted_element_count_ = f.inserted_element_count_;
234 random_seed_ = f.random_seed_;
235 desired_false_positive_probability_ = f.desired_false_positive_probability_;
236 bit_table_.resize( raw_table_size_ );
237 std::copy(f.bit_table_.data(),f.bit_table_.data() + raw_table_size_,bit_table_.data());
238 salt_ = f.salt_;
239 }
240 return *this;
241 }
242
244 {
245 }
246
247 inline bool operator!() const
248 {
249 return (0 == table_size_);
250 }
251
252 inline void clear()
253 {
254 std::fill_n(bit_table_.data(),raw_table_size_,0x00);
256 }
257
258 inline void insert(const unsigned char* key_begin, const std::size_t& length)
259 {
260 std::size_t bit_index = 0;
261 std::size_t bit = 0;
262 for (std::size_t i = 0; i < salt_.size(); ++i)
263 {
264 compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit);
265 bit_table_[bit_index / bits_per_char] |= bit_mask[bit];
266 }
268 }
269
270 template<typename T>
271 inline void insert(const T& t)
272 {
273 // Note: T must be a C++ POD type.
274 insert(reinterpret_cast<const unsigned char*>(&t),sizeof(T));
275 }
276
277 inline void insert(const std::string& key)
278 {
279 insert(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
280 }
281
282 inline void insert(const char* data, const std::size_t& length)
283 {
284 insert(reinterpret_cast<const unsigned char*>(data),length);
285 }
286
287 template<typename InputIterator>
288 inline void insert(const InputIterator begin, const InputIterator end)
289 {
290 InputIterator itr = begin;
291 while (end != itr)
292 {
293 insert(*(itr++));
294 }
295 }
296
297 inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const
298 {
299 std::size_t bit_index = 0;
300 std::size_t bit = 0;
301 for (std::size_t i = 0; i < salt_.size(); ++i)
302 {
303 compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit);
304 if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit])
305 {
306 return false;
307 }
308 }
309 return true;
310 }
311
312 template<typename T>
313 inline bool contains(const T& t) const
314 {
315 return contains(reinterpret_cast<const unsigned char*>(&t),static_cast<std::size_t>(sizeof(T)));
316 }
317
318 inline bool contains(const std::string& key) const
319 {
320 return contains(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
321 }
322
323 inline bool contains(const char* data, const std::size_t& length) const
324 {
325 return contains(reinterpret_cast<const unsigned char*>(data),length);
326 }
327
328 template<typename InputIterator>
329 inline InputIterator contains_all(const InputIterator begin, const InputIterator end) const
330 {
331 InputIterator itr = begin;
332 while (end != itr)
333 {
334 if (!contains(*itr))
335 {
336 return itr;
337 }
338 ++itr;
339 }
340 return end;
341 }
342
343 template<typename InputIterator>
344 inline InputIterator contains_none(const InputIterator begin, const InputIterator end) const
345 {
346 InputIterator itr = begin;
347 while (end != itr)
348 {
349 if (contains(*itr))
350 {
351 return itr;
352 }
353 ++itr;
354 }
355 return end;
356 }
357
358 inline virtual unsigned long long int size() const
359 {
360 return table_size_;
361 }
362
363 inline std::size_t element_count() const
364 {
366 }
367
368 inline double effective_fpp() const
369 {
370 /*
371 Note:
372 The effective false positive probability is calculated using the
373 designated table size and hash function count in conjunction with
374 the current number of inserted elements - not the user defined
375 predicated/expected number of inserted elements.
376 */
377 return std::pow(1.0 - std::exp(-1.0 * salt_.size() * inserted_element_count_ / size()), 1.0 * salt_.size());
378 }
379
381 {
382 /* intersection */
383 if (
384 (salt_count_ == f.salt_count_) &&
385 (table_size_ == f.table_size_) &&
386 (random_seed_ == f.random_seed_)
387 )
388 {
389 for (std::size_t i = 0; i < raw_table_size_; ++i)
390 {
391 bit_table_[i] &= f.bit_table_[i];
392 }
393 }
394 return *this;
395 }
396
398 {
399 /* union */
400 if (
401 (salt_count_ == f.salt_count_) &&
402 (table_size_ == f.table_size_) &&
403 (random_seed_ == f.random_seed_)
404 )
405 {
406 for (std::size_t i = 0; i < raw_table_size_; ++i)
407 {
408 bit_table_[i] |= f.bit_table_[i];
409 }
410 }
411 return *this;
412 }
413
415 {
416 /* difference */
417 if (
418 (salt_count_ == f.salt_count_) &&
419 (table_size_ == f.table_size_) &&
420 (random_seed_ == f.random_seed_)
421 )
422 {
423 for (std::size_t i = 0; i < raw_table_size_; ++i)
424 {
425 bit_table_[i] ^= f.bit_table_[i];
426 }
427 }
428 return *this;
429 }
430
431 inline const cell_type* table() const
432 {
433 return bit_table_.data();
434 }
435
436 inline std::size_t hash_count()
437 {
438 return salt_.size();
439 }
440
441protected:
442
443 inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
444 {
445 bit_index = hash % table_size_;
446 bit = bit_index % bits_per_char;
447 }
448
450 {
451 /*
452 Note:
453 A distinct hash function need not be implementation-wise
454 distinct. In the current implementation "seeding" a common
455 hash function with different values seems to be adequate.
456 */
457 const unsigned int predef_salt_count = 128;
458 static const bloom_type predef_salt[predef_salt_count] =
459 {
460 0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC,
461 0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B,
462 0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66,
463 0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA,
464 0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99,
465 0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33,
466 0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5,
467 0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000,
468 0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F,
469 0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63,
470 0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7,
471 0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492,
472 0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A,
473 0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B,
474 0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3,
475 0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432,
476 0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC,
477 0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB,
478 0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331,
479 0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68,
480 0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8,
481 0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A,
482 0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF,
483 0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E,
484 0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39,
485 0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E,
486 0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355,
487 0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E,
488 0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79,
489 0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075,
490 0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC,
491 0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421
492 };
493
494 if (salt_count_ <= predef_salt_count)
495 {
496 std::copy(predef_salt,
497 predef_salt + salt_count_,
498 std::back_inserter(salt_));
499 for (unsigned int i = 0; i < salt_.size(); ++i)
500 {
501 /*
502 Note:
503 This is done to integrate the user defined random seed,
504 so as to allow for the generation of unique bloom filter
505 instances.
506 */
507 salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + static_cast<bloom_type>(random_seed_);
508 }
509 }
510 else
511 {
512 std::copy(predef_salt,predef_salt + predef_salt_count,std::back_inserter(salt_));
513 srand(static_cast<unsigned int>(random_seed_));
514 while (salt_.size() < salt_count_)
515 {
516 bloom_type current_salt = static_cast<bloom_type>(rand()) * static_cast<bloom_type>(rand());
517 if (0 == current_salt) continue;
518 if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt))
519 {
520 salt_.push_back(current_salt);
521 }
522 }
523 }
524 }
525
526 inline bloom_type hash_ap(const unsigned char* begin, std::size_t remaining_length, bloom_type hash) const
527 {
528 const unsigned char* itr = begin;
529 unsigned int loop = 0;
530 while (remaining_length >= 8)
531 {
532 const unsigned int& i1 = *(reinterpret_cast<const unsigned int*>(itr)); itr += sizeof(unsigned int);
533 const unsigned int& i2 = *(reinterpret_cast<const unsigned int*>(itr)); itr += sizeof(unsigned int);
534 hash ^= (hash << 7) ^ i1 * (hash >> 3) ^
535 (~((hash << 11) + (i2 ^ (hash >> 5))));
536 remaining_length -= 8;
537 }
538 if (remaining_length)
539 {
540 if (remaining_length >= 4)
541 {
542 const unsigned int& i = *(reinterpret_cast<const unsigned int*>(itr));
543 if (loop & 0x01)
544 hash ^= (hash << 7) ^ i * (hash >> 3);
545 else
546 hash ^= (~((hash << 11) + (i ^ (hash >> 5))));
547 ++loop;
548 remaining_length -= 4;
549 itr += sizeof(unsigned int);
550 }
551 if (remaining_length >= 2)
552 {
553 const unsigned short& i = *(reinterpret_cast<const unsigned short*>(itr));
554 if (loop & 0x01)
555 hash ^= (hash << 7) ^ i * (hash >> 3);
556 else
557 hash ^= (~((hash << 11) + (i ^ (hash >> 5))));
558 ++loop;
559 remaining_length -= 2;
560 itr += sizeof(unsigned short);
561 }
562 if (remaining_length)
563 {
564 hash += ((*itr) ^ (hash * 0xA5A5A5A5)) + loop;
565 }
566 }
567 return hash;
568 }
569
570public:
571 std::vector<bloom_type> salt_;
572 std::vector<unsigned char> bit_table_;
573 unsigned int salt_count_;
574 unsigned long long int table_size_;
575 unsigned long long int raw_table_size_;
576 unsigned long long int projected_element_count_;
578 unsigned long long int random_seed_;
580};
581
583{
584 bloom_filter result = a;
585 result &= b;
586 return result;
587}
588
590{
591 bloom_filter result = a;
592 result |= b;
593 return result;
594}
595
597{
598 bloom_filter result = a;
599 result ^= b;
600 return result;
601}
602
603
604} // namespace fc
605
606
607FC_REFLECT( fc::bloom_filter, (salt_)(bit_table_)(salt_count_)(table_size_)(raw_table_size_)(projected_element_count_)(inserted_element_count_)(random_seed_)(desired_false_positive_probability_) )
608FC_REFLECT( fc::bloom_parameters::optimal_parameters_t, (number_of_hashes)(table_size) )
609FC_REFLECT( fc::bloom_parameters, (minimum_size)(maximum_size)(minimum_number_of_hashes)(maximum_number_of_hashes)(projected_element_count)(false_positive_probability)(random_seed)(optimal_parameters) )
610
611/*
612 Note 1:
613 If it can be guaranteed that bits_per_char will be of the form 2^n then
614 the following optimization can be used:
615
616 hash_table[bit_index >> n] |= bit_mask[bit_index & (bits_per_char - 1)];
617
618 Note 2:
619 For performance reasons where possible when allocating memory it should
620 be aligned (aligned_alloc) according to the architecture being used.
621*/
const mie::Vuint & p
Definition bn.cpp:27
void insert(const unsigned char *key_begin, const std::size_t &length)
unsigned int salt_count_
std::vector< unsigned char > bit_table_
bloom_filter(const bloom_filter &filter)
virtual unsigned long long int size() const
void insert(const std::string &key)
std::size_t element_count() const
std::vector< bloom_type > salt_
InputIterator contains_none(const InputIterator begin, const InputIterator end) const
const cell_type * table() const
InputIterator contains_all(const InputIterator begin, const InputIterator end) const
bloom_filter & operator=(const bloom_filter &f)
double effective_fpp() const
void insert(const T &t)
void insert(const char *data, const std::size_t &length)
unsigned int bloom_type
unsigned long long int table_size_
virtual void compute_indices(const bloom_type &hash, std::size_t &bit_index, std::size_t &bit) const
virtual bool contains(const unsigned char *key_begin, const std::size_t length) const
bool contains(const char *data, const std::size_t &length) const
bool operator==(const bloom_filter &f) const
bool contains(const std::string &key) const
bool contains(const T &t) const
bool operator!() const
bloom_type hash_ap(const unsigned char *begin, std::size_t remaining_length, bloom_type hash) const
void insert(const InputIterator begin, const InputIterator end)
unsigned long long int projected_element_count_
unsigned char cell_type
unsigned long long int raw_table_size_
bloom_filter(const bloom_parameters &p)
virtual ~bloom_filter()
bloom_filter & operator^=(const bloom_filter &f)
bloom_filter & operator|=(const bloom_filter &f)
std::size_t hash_count()
double desired_false_positive_probability_
bloom_filter & operator&=(const bloom_filter &f)
bool operator!=(const bloom_filter &f) const
unsigned int inserted_element_count_
unsigned long long int random_seed_
unsigned long long int maximum_size
unsigned long long int random_seed
unsigned long long int projected_element_count
optimal_parameters_t optimal_parameters
virtual bool compute_optimal_parameters()
unsigned int maximum_number_of_hashes
unsigned int minimum_number_of_hashes
unsigned long long int minimum_size
namespace sysio::chain
Definition authority.cpp:3
bloom_filter operator|(const bloom_filter &a, const bloom_filter &b)
bloom_filter operator^(const bloom_filter &a, const bloom_filter &b)
bloom_filter operator&(const bloom_filter &a, const bloom_filter &b)
Definition name.hpp:106
const GenericPointer< typename T::ValueType > T2 T::AllocatorType & a
Definition pointer.h:1181
#define T(meth, val, expected)
#define FC_REFLECT(TYPE, MEMBERS)
Specializes fc::reflector for TYPE.
Definition reflect.hpp:311
int bit
Definition yubihsm.h:566