Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ add_subdirectory(json)

set(SEARCH_LIB query_parser)

add_library(dfly_core compact_object.cc dragonfly_core.cc extent_tree.cc
add_library(dfly_core bloom.cc compact_object.cc dragonfly_core.cc extent_tree.cc
external_alloc.cc interpreter.cc mi_memory_resource.cc sds_utils.cc
segment_allocator.cc score_map.cc small_string.cc sorted_map.cc
tx_queue.cc dense_set.cc allocation_tracker.cc task_queue.cc
Expand All @@ -28,3 +28,4 @@ cxx_test(sorted_map_test dfly_core redis_test_lib LABELS DFLY)
cxx_test(bptree_set_test dfly_core LABELS DFLY)
cxx_test(score_map_test dfly_core LABELS DFLY)
cxx_test(flatbuffers_test dfly_core TRDP::flatbuffers LABELS DFLY)
cxx_test(bloom_test dfly_core LABELS DFLY)
105 changes: 105 additions & 0 deletions src/core/bloom.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
// Copyright 2024, DragonflyDB authors. All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/bloom.h"

#include <absl/base/internal/endian.h>
#include <absl/numeric/bits.h>
#include <mimalloc.h>

#include <cmath>

#define XXH_STATIC_LINKING_ONLY
#include <xxhash.h>

#include "base/logging.h"

namespace dfly {

using namespace std;

namespace {

// Computes the seeded 128-bit XXH3 digest of `str`. Both 64-bit halves of the result
// are used by the caller to derive the per-probe bit positions.
inline XXH128_hash_t Hash(string_view str) {
  constexpr uint64_t kSeed = 0xc6a4a7935bd1e995ULL;  // murmur2 seed
  const char* data = str.data();
  const size_t len = str.size();
  return XXH3_128bits_withSeed(data, len, kSeed);
}

// Returns the bit position of the i-th probe for `hash`.
//
// The two 64-bit halves of the digest are combined as low64 + high64 * i (unsigned
// arithmetic, wrapping modulo 2^64) and then reduced to the size of the bit array.
// `mask` is expected to be (number_of_bits - 1) where number_of_bits is a power of two,
// so the bitwise AND is equivalent to a modulo by the number of bits and can reach
// every position, including the last one. Taking the remainder by `mask` itself would
// leave the highest bit unused and cost an integer division per probe.
inline uint64_t BitIndex(const XXH128_hash_t& hash, unsigned i, uint64_t mask) {
  return (hash.low64 + hash.high64 * i) & mask;
}

} // namespace

// Builds an empty filter dimensioned for `entries` items at false-positive rate `error`.
// The bit array is allocated from `heap`.
//   entries - expected number of items; values below 1024 are raised to 1024.
//   error   - target false-positive probability, must be strictly inside (0, 1).
//   heap    - mimalloc heap that provides the storage for the bit array.
Bloom::Bloom(uint32_t entries, double error, mi_heap_t* heap) {
CHECK(error > 0 && error < 1);

// Enforce the minimum capacity.
if (entries < 1024)
entries = 1024;

// bpe is the number of bits per entry: -ln(error) / (ln 2)^2.
constexpr double kDenom = M_LN2 * M_LN2;
double bpe = -log(error) / kDenom;

// Number of bit probes per key: ceil(ln 2 * bpe).
// NOTE(review): hash_cnt_ is a uint8_t, so for extremely small `error` values
// (ceil result above 255) the count is truncated - confirm this is acceptable.
hash_cnt_ = ceil(M_LN2 * bpe);

// Total number of bits, rounded up to a power of two and never below 1024.
uint64_t bits = uint64_t(ceil(entries * bpe));
bits = absl::bit_ceil(bits); // make it power of 2.
if (bits < 1024) {
bits = 1024;
}

// Size of the bit array in bytes.
uint64_t length = bits / 8;

// calloc-style allocation: the array starts zeroed, i.e. the filter starts empty.
// NOTE(review): the returned pointer is not checked for null.
bf_ = (uint8_t*)mi_heap_calloc(heap, length, 1);

Comment on lines +54 to +56
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just a thought, but an utuility that returns a unique_ptr on the mi-heap with a custom deleter would be helpful, we do this currently for some pointers in connection

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe but then we will increase the class size

static_assert(absl::countr_zero(8u) == 3);
// bits is a power of two, so its trailing-zero count equals log2(bits).
bit_log_ = absl::countr_zero(bits);
DCHECK_EQ(1UL << bit_log_, bits);
}

// Releases the bit array that the constructor allocated with mi_heap_calloc.
Bloom::~Bloom() {
mi_free(bf_);
}

bool Bloom::Exists(std::string_view str) const {
XXH128_hash_t hash = Hash(str);

uint64_t mask = GetMask();
for (unsigned i = 0; i < hash_cnt_; ++i) {
uint64_t index = BitIndex(hash, i, mask);
if (!IsSet(index))
return false;
}
return true;
}

bool Bloom::Add(std::string_view str) {
XXH128_hash_t hash = Hash(str);
uint64_t mask = GetMask();

unsigned changes = 0;
for (uint64_t i = 0; i < hash_cnt_; i++) {
uint64_t index = BitIndex(hash, i, mask);
changes += Set(index);
}

return changes != 0;
}

// Tests the bit at position `bit_idx` of the bit array.
inline bool Bloom::IsSet(size_t bit_idx) const {
  const uint8_t cell = bf_[bit_idx >> 3];   // byte that holds the bit
  const unsigned offset = bit_idx & 7;      // position of the bit inside that byte
  return ((cell >> offset) & 1u) != 0;
}

// Turns on the bit at position `bit_idx` of the bit array.
// Returns true if the bit was clear before the call, false if it was already set.
inline bool Bloom::Set(size_t bit_idx) {
  uint8_t& cell = bf_[bit_idx >> 3];                               // byte that holds the bit
  const uint8_t bit = static_cast<uint8_t>(1u << (bit_idx & 7));   // single-bit mask in that byte
  const bool was_clear = (cell & bit) == 0;
  cell |= bit;
  return was_clear;
}
Comment on lines +91 to +105
Copy link
Contributor

@dranikpg dranikpg Mar 29, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You actually don't need all of this if you just use vector<bool> 🤔 It has proxies that let you reference a specific bit in its internal storage.

Copy link
Collaborator Author

@romange romange Mar 29, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are right, but then I would need to pass a mimalloc memory resource, etc. It's not a lot of code, and I won't object to reducing it in the future if memory usage with vector<bool> doesn't increase.


} // namespace dfly
44 changes: 44 additions & 0 deletions src/core/bloom.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// Copyright 2024, DragonflyDB authors. All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <cstdint>
#include <string_view>

typedef struct mi_heap_s mi_heap_t;

namespace dfly {

// Bloom filter based on the design of https://github.com/jvirkki/libbloom
//
// The filter owns a raw bit array that is allocated from a mimalloc heap in the
// constructor and released in the destructor.
class Bloom {
 public:
  // error must be in (0, 1) range.
  // entries are silently rounded up to the minimum capacity.
  Bloom(uint32_t entries, double error, mi_heap_t* heap);
  ~Bloom();

  // The bit array is held through a raw owning pointer, so a member-wise copy would
  // leave two objects freeing the same buffer. Copying is therefore disabled.
  Bloom(const Bloom&) = delete;
  Bloom& operator=(const Bloom&) = delete;

  // Returns true if all bits that correspond to str are set, i.e. str may have been added.
  bool Exists(std::string_view str) const;

  /*
   * Return true if element was not present and was added,
   * false - element (or a collision) had already been added previously.
   */
  bool Add(std::string_view str);

 private:
  bool IsSet(size_t index) const;
  bool Set(size_t index);  // return true if bit was set (i.e was 0 before)

  // Mask with the low bit_log_ bits set, i.e. (number of bits in the array) - 1.
  uint64_t GetMask() const {
    return (1ULL << bit_log_) - 1;
  }
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it's only called to pass it into BitIndex()

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, you want to move it into cc?


  uint8_t hash_cnt_;  // number of bit probes performed per key
  uint8_t bit_log_;   // log2 of the number of bits in the array

  uint8_t* bf_;  // the bit array; owned, released in the destructor
};

} // namespace dfly
61 changes: 61 additions & 0 deletions src/core/bloom_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// Copyright 2024, DragonflyDB authors. All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/bloom.h"

#include <absl/strings/str_cat.h>
#include <gmock/gmock.h>
#include <mimalloc.h>

#include "base/gtest.h"

namespace dfly {

using namespace std;

class BloomTest : public ::testing::Test {
protected:
BloomTest() : bloom_(1000, 0.001, mi_heap_get_default()) {
}

Bloom bloom_;
};

// Checks the basic contract for the empty key and for 100 distinct keys.
TEST_F(BloomTest, Basic) {
  // A key must be absent before its first insertion, reported as new exactly once,
  // and present afterwards.
  auto verify_first_insert = [this](string_view key) {
    EXPECT_FALSE(bloom_.Exists(key));
    EXPECT_TRUE(bloom_.Add(key));
    EXPECT_TRUE(bloom_.Exists(key));
    EXPECT_FALSE(bloom_.Add(key));
  };

  verify_first_insert(string_view{});

  for (unsigned i = 0; i < 100; ++i) {
    const string key = absl::StrCat("val", i);
    verify_first_insert(key);
  }
}

// Benchmarks Bloom::Exists against a filter that was filled to 80% of its requested
// capacity with keys of the form "val<N>".
static void BM_BloomExist(benchmark::State& state) {
  constexpr size_t kCapacity = 1U << 22;
  Bloom bloom(kCapacity, 0.001, mi_heap_get_default());
  for (size_t i = 0; i < kCapacity * 0.8; ++i) {
    bloom.Add(absl::StrCat("val", i));
  }

  // Fixed-size 32-byte lookup key, pre-filled with 'x'. The decimal counter is written
  // at the start of the buffer on every iteration.
  unsigned i = 0;
  char buf[32];
  memset(buf, 'x', sizeof(buf));
  string_view sv{buf, sizeof(buf)};
  while (state.KeepRunning()) {
    // Advance the counter so each iteration probes a different key instead of
    // repeatedly looking up the same one.
    absl::numbers_internal::FastIntToBuffer(i++, buf);
    // Consume the result so the lookup cannot be discarded by the optimizer.
    benchmark::DoNotOptimize(bloom.Exists(sv));
  }
}
BENCHMARK(BM_BloomExist);

} // namespace dfly