-
Notifications
You must be signed in to change notification settings - Fork 1.1k
chore: add bloom filter class #2791
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
// Copyright 2024, DragonflyDB authors. All rights reserved. | ||
// See LICENSE for licensing terms. | ||
// | ||
|
||
#include "core/bloom.h" | ||
|
||
#include <absl/base/internal/endian.h> | ||
#include <absl/numeric/bits.h> | ||
#include <mimalloc.h> | ||
|
||
#include <cmath> | ||
|
||
#define XXH_STATIC_LINKING_ONLY | ||
#include <xxhash.h> | ||
|
||
#include "base/logging.h" | ||
|
||
namespace dfly { | ||
|
||
using namespace std; | ||
|
||
namespace { | ||
|
||
// Computes the 128-bit XXH3 hash of `str` with a fixed seed, so equal inputs
// always yield the same (low64, high64) pair.
inline XXH128_hash_t Hash(string_view str) {
  constexpr uint64_t kSeed = 0xc6a4a7935bd1e995ULL;  // murmur2 seed
  return XXH3_128bits_withSeed(str.data(), str.size(), kSeed);
}
|
||
inline uint64_t BitIndex(const XXH128_hash_t& hash, unsigned i, uint64_t mask) { | ||
return (hash.low64 + hash.high64 * i) % mask; | ||
} | ||
|
||
} // namespace | ||
|
||
Bloom::Bloom(uint32_t entries, double error, mi_heap_t* heap) { | ||
CHECK(error > 0 && error < 1); | ||
|
||
if (entries < 1024) | ||
entries = 1024; | ||
|
||
constexpr double kDenom = M_LN2 * M_LN2; | ||
double bpe = -log(error) / kDenom; | ||
|
||
hash_cnt_ = ceil(M_LN2 * bpe); | ||
|
||
uint64_t bits = uint64_t(ceil(entries * bpe)); | ||
bits = absl::bit_ceil(bits); // make it power of 2. | ||
if (bits < 1024) { | ||
bits = 1024; | ||
} | ||
|
||
uint64_t length = bits / 8; | ||
|
||
bf_ = (uint8_t*)mi_heap_calloc(heap, length, 1); | ||
|
||
static_assert(absl::countr_zero(8u) == 3); | ||
bit_log_ = absl::countr_zero(bits); | ||
DCHECK_EQ(1UL << bit_log_, bits); | ||
} | ||
|
||
// Hands the bit array allocated by the constructor back to mimalloc.
Bloom::~Bloom() {
  mi_free(this->bf_);
}
|
||
bool Bloom::Exists(std::string_view str) const { | ||
XXH128_hash_t hash = Hash(str); | ||
|
||
uint64_t mask = GetMask(); | ||
for (unsigned i = 0; i < hash_cnt_; ++i) { | ||
uint64_t index = BitIndex(hash, i, mask); | ||
if (!IsSet(index)) | ||
return false; | ||
} | ||
return true; | ||
} | ||
|
||
bool Bloom::Add(std::string_view str) { | ||
XXH128_hash_t hash = Hash(str); | ||
uint64_t mask = GetMask(); | ||
|
||
unsigned changes = 0; | ||
for (uint64_t i = 0; i < hash_cnt_; i++) { | ||
uint64_t index = BitIndex(hash, i, mask); | ||
changes += Set(index); | ||
} | ||
|
||
return changes != 0; | ||
} | ||
|
||
// Reports whether the bit at position `bit_idx` of the bit array is set.
inline bool Bloom::IsSet(size_t bit_idx) const {
  const uint64_t byte_pos = bit_idx >> 3;  // byte that holds the bit
  const unsigned shift = bit_idx & 7;      // position of the bit inside that byte
  return ((bf_[byte_pos] >> shift) & 1) != 0;
}
|
||
// Sets the bit at position `bit_idx` of the bit array.
// Returns true if the bit was clear before the call, false if it was already set.
inline bool Bloom::Set(size_t bit_idx) {
  const uint64_t byte_pos = bit_idx >> 3;                   // byte that holds the bit
  const uint8_t bit = uint8_t(1u << (bit_idx & 7));         // single-bit mask inside that byte

  const uint8_t before = bf_[byte_pos];
  bf_[byte_pos] |= bit;
  return (before & bit) == 0;
}
Comment on lines
+91
to
+105
Review comment: You actually don't need all of this if you just use `std::vector<bool>`. Reply: You are right, but then I would need to pass a mimalloc memory resource, etc. It's not a lot of code, and I won't object to reducing it in the future, provided that memory usage with `std::vector<bool>` does not increase. |
||
|
||
} // namespace dfly |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
// Copyright 2024, DragonflyDB authors. All rights reserved. | ||
// See LICENSE for licensing terms. | ||
// | ||
|
||
#pragma once | ||
|
||
#include <cstdint> | ||
#include <string_view> | ||
|
||
typedef struct mi_heap_s mi_heap_t; | ||
|
||
namespace dfly { | ||
|
||
// Bloom filter based on the design of https://github.com/jvirkki/libbloom | ||
class Bloom { | ||
public: | ||
// error must be in (0, 1) range. | ||
// entries are silently rounded up to the minimum capacity. | ||
Bloom(uint32_t entries, double error, mi_heap_t* heap); | ||
~Bloom(); | ||
|
||
bool Exists(std::string_view str) const; | ||
|
||
/* | ||
* Return true if element was not present and was added, | ||
* false - element (or a collision) had already been added previously. | ||
*/ | ||
bool Add(std::string_view str); | ||
romange marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
||
private: | ||
bool IsSet(size_t index) const; | ||
bool Set(size_t index); // return true if bit was set (i.e was 0 before) | ||
|
||
uint64_t GetMask() const { | ||
return (1ULL << bit_log_) - 1; | ||
} | ||
|
||
|
||
uint8_t hash_cnt_; | ||
uint8_t bit_log_; | ||
|
||
uint8_t* bf_; | ||
}; | ||
|
||
} // namespace dfly |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
// Copyright 2024, DragonflyDB authors. All rights reserved. | ||
// See LICENSE for licensing terms. | ||
// | ||
|
||
#include "core/bloom.h" | ||
|
||
#include <absl/strings/str_cat.h> | ||
#include <gmock/gmock.h> | ||
#include <mimalloc.h> | ||
|
||
#include "base/gtest.h" | ||
|
||
namespace dfly { | ||
|
||
using namespace std; | ||
|
||
class BloomTest : public ::testing::Test { | ||
protected: | ||
BloomTest() : bloom_(1000, 0.001, mi_heap_get_default()) { | ||
} | ||
|
||
Bloom bloom_; | ||
}; | ||
|
||
// Verifies the Add/Exists contract: a key is absent before its first Add,
// the first Add reports a change, and a repeated Add reports none.
TEST_F(BloomTest, Basic) {
  const string_view empty_key;
  EXPECT_FALSE(bloom_.Exists(empty_key));
  EXPECT_TRUE(bloom_.Add(empty_key));
  EXPECT_TRUE(bloom_.Exists(empty_key));
  EXPECT_FALSE(bloom_.Add(empty_key));

  // Same sequence for the keys "val0" .. "val99".
  for (unsigned i = 0; i < 100; ++i) {
    const string key = absl::StrCat("val", i);

    EXPECT_FALSE(bloom_.Exists(key));
    EXPECT_TRUE(bloom_.Add(key));
    EXPECT_TRUE(bloom_.Exists(key));
    EXPECT_FALSE(bloom_.Add(key));
  }
}
|
||
static void BM_BloomExist(benchmark::State& state) { | ||
constexpr size_t kCapacity = 1U << 22; | ||
Bloom bloom(kCapacity, 0.001, mi_heap_get_default()); | ||
for (size_t i = 0; i < kCapacity * 0.8; ++i) { | ||
bloom.Add(absl::StrCat("val", i)); | ||
} | ||
unsigned i = 0; | ||
char buf[32]; | ||
memset(buf, 'x', sizeof(buf)); | ||
string_view sv{buf, sizeof(buf)}; | ||
while (state.KeepRunning()) { | ||
absl::numbers_internal::FastIntToBuffer(i, buf); | ||
bloom.Exists(sv); | ||
} | ||
} | ||
BENCHMARK(BM_BloomExist); | ||
|
||
} // namespace dfly |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just a thought, but a utility that returns a unique_ptr allocated on the mi-heap with a custom deleter would be helpful; we currently do this for some pointers in connection.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe, but then we will increase the class size.