From 4f810302583b801c7f8b3bc042ce49a174e54de6 Mon Sep 17 00:00:00 2001 From: Jon Malkin <786705+jmalkin@users.noreply.github.com> Date: Sun, 14 Jul 2024 14:15:51 -0700 Subject: [PATCH 01/30] WIP: bloom filter. No serialization, incomplete testing --- CMakeLists.txt | 1 + LICENSE | 42 ++- common/CMakeLists.txt | 3 +- common/include/xxhash64.h | 202 ++++++++++++ filters/CMakeLists.txt | 43 +++ filters/include/bit_array.hpp | 157 ++++++++++ filters/include/bit_array_impl.hpp | 214 +++++++++++++ filters/include/bloom_filter.hpp | 432 ++++++++++++++++++++++++++ filters/include/bloom_filter_impl.hpp | 382 +++++++++++++++++++++++ filters/test/CMakeLists.txt | 45 +++ filters/test/bit_array_test.cpp | 132 ++++++++ filters/test/bloom_filter_test.cpp | 41 +++ 12 files changed, 1686 insertions(+), 8 deletions(-) create mode 100644 common/include/xxhash64.h create mode 100644 filters/CMakeLists.txt create mode 100644 filters/include/bit_array.hpp create mode 100644 filters/include/bit_array_impl.hpp create mode 100644 filters/include/bloom_filter.hpp create mode 100644 filters/include/bloom_filter_impl.hpp create mode 100644 filters/test/CMakeLists.txt create mode 100644 filters/test/bit_array_test.cpp create mode 100644 filters/test/bloom_filter_test.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index e3505c19..056bb701 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -119,6 +119,7 @@ add_subdirectory(quantiles) add_subdirectory(count) add_subdirectory(density) add_subdirectory(tdigest) +add_subdirectory(filters) if (WITH_PYTHON) add_subdirectory(python) diff --git a/LICENSE b/LICENSE index d156d1cc..2a30395d 100644 --- a/LICENSE +++ b/LICENSE @@ -207,9 +207,9 @@ APPENDIX A: How to apply the Apache License to your work. APPENDIX B: Additional licenses relevant to this product. - This product includes a number of source files with code that has been - adapted from 3rd party sources including sources that may be subject - to different copyright notices and license terms. 
Your use of + This product includes a number of source files with code that has been + adapted from 3rd party sources including sources that may be subject + to different copyright notices and license terms. Your use of the source code for these subcomponents is subject to the terms and conditions of the following licenses. @@ -221,7 +221,7 @@ APPENDIX B: Additional licenses relevant to this product. https://github.com/catchorg/Catch2/blob/v2.x/LICENSE.txt Boost Software License - Version 1.0 - August 17th, 2003 - + Permission is hereby granted, free of charge, to any person or organization obtaining a copy of the software and accompanying documentation covered by this license (the "Software") to use, reproduce, display, distribute, @@ -248,6 +248,35 @@ APPENDIX B: Additional licenses relevant to this product. Found in the Catch2 unit test code that is downloaded from github.com as part of CMake configuration if configured to build tests. + ============================================================= + MIT License + ============================================================= + + Original source: + https://github.com/stbrumme/xxhash/blob/master/LICENSE + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), + to deal in the Software without restriction, including without limitation + the rights to use, copy, modify, merge, publish, distribute, sublicense, + and/or sell copies of the Software, and to permit persons to whom the Software + is furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, + INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A + PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + Code Location: + common/include/xxhash64.h + Original source code: + Copyright (c) 2018 Stephan Brumme + https://github.com/stbrumme/xxhash/blob/master/xxhash64.h ============================================================= Public Domain @@ -255,7 +284,7 @@ APPENDIX B: Additional licenses relevant to this product. Original source code: https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp Placed in the Public Domain by Austin Appleby - + Code Locations: common/include/MurmurHash3.h that is adapted from the above. @@ -263,8 +292,7 @@ APPENDIX B: Additional licenses relevant to this product. Original source code: * https://graphics.stanford.edu/~seander/bithacks.html * Placed in the Public Domain by Sean Eron Anderson - + Code Locations: * common/include/ceiling_power_of_2.hpp that is adapted from the above. 
- \ No newline at end of file diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 6de5703a..8514433b 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -31,7 +31,7 @@ target_include_directories(common install(TARGETS common EXPORT ${PROJECT_NAME}) -install(FILES +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/include/version.hpp include/binomial_bounds.hpp include/bounds_binomial_proportions.hpp @@ -49,4 +49,5 @@ install(FILES include/quantiles_sorted_view_impl.hpp include/quantiles_sorted_view.hpp include/serde.hpp + include/xxhash64.h DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches") diff --git a/common/include/xxhash64.h b/common/include/xxhash64.h new file mode 100644 index 00000000..4d0bbc5d --- /dev/null +++ b/common/include/xxhash64.h @@ -0,0 +1,202 @@ +// ////////////////////////////////////////////////////////// +// xxhash64.h +// Copyright (c) 2016 Stephan Brumme. All rights reserved. +// see http://create.stephan-brumme.com/disclaimer.html +// + +#pragma once +#include // for uint32_t and uint64_t + +/// XXHash (64 bit), based on Yann Collet's descriptions, see http://cyan4973.github.io/xxHash/ +/** How to use: + uint64_t myseed = 0; + XXHash64 myhash(myseed); + myhash.add(pointerToSomeBytes, numberOfBytes); + myhash.add(pointerToSomeMoreBytes, numberOfMoreBytes); // call add() as often as you like to ... + // and compute hash: + uint64_t result = myhash.hash(); + + // or all of the above in one single line: + uint64_t result2 = XXHash64::hash(mypointer, numBytes, myseed); + + Note: my code is NOT endian-aware ! 
+**/ +class XXHash64 +{ +public: + /// create new XXHash (64 bit) + /** @param seed your seed value, even zero is a valid seed **/ + explicit XXHash64(uint64_t seed) + { + state[0] = seed + Prime1 + Prime2; + state[1] = seed + Prime2; + state[2] = seed; + state[3] = seed - Prime1; + bufferSize = 0; + totalLength = 0; + } + + /// add a chunk of bytes + /** @param input pointer to a continuous block of data + @param length number of bytes + @return false if parameters are invalid / zero **/ + bool add(const void* input, uint64_t length) + { + // no data ? + if (!input || length == 0) + return false; + + totalLength += length; + // byte-wise access + const unsigned char* data = (const unsigned char*)input; + + // unprocessed old data plus new data still fit in temporary buffer ? + if (bufferSize + length < MaxBufferSize) + { + // just add new data + while (length-- > 0) + buffer[bufferSize++] = *data++; + return true; + } + + // point beyond last byte + const unsigned char* stop = data + length; + const unsigned char* stopBlock = stop - MaxBufferSize; + + // some data left from previous update ? 
+ if (bufferSize > 0) + { + // make sure temporary buffer is full (16 bytes) + while (bufferSize < MaxBufferSize) + buffer[bufferSize++] = *data++; + + // process these 32 bytes (4x8) + process(buffer, state[0], state[1], state[2], state[3]); + } + + // copying state to local variables helps optimizer A LOT + uint64_t s0 = state[0], s1 = state[1], s2 = state[2], s3 = state[3]; + // 32 bytes at once + while (data <= stopBlock) + { + // local variables s0..s3 instead of state[0]..state[3] are much faster + process(data, s0, s1, s2, s3); + data += 32; + } + // copy back + state[0] = s0; state[1] = s1; state[2] = s2; state[3] = s3; + + // copy remainder to temporary buffer + bufferSize = stop - data; + for (uint64_t i = 0; i < bufferSize; i++) + buffer[i] = data[i]; + + // done + return true; + } + + /// get current hash + /** @return 64 bit XXHash **/ + uint64_t hash() const + { + // fold 256 bit state into one single 64 bit value + uint64_t result; + if (totalLength >= MaxBufferSize) + { + result = rotateLeft(state[0], 1) + + rotateLeft(state[1], 7) + + rotateLeft(state[2], 12) + + rotateLeft(state[3], 18); + result = (result ^ processSingle(0, state[0])) * Prime1 + Prime4; + result = (result ^ processSingle(0, state[1])) * Prime1 + Prime4; + result = (result ^ processSingle(0, state[2])) * Prime1 + Prime4; + result = (result ^ processSingle(0, state[3])) * Prime1 + Prime4; + } + else + { + // internal state wasn't set in add(), therefore original seed is still stored in state2 + result = state[2] + Prime5; + } + + result += totalLength; + + // process remaining bytes in temporary buffer + const unsigned char* data = buffer; + // point beyond last byte + const unsigned char* stop = data + bufferSize; + + // at least 8 bytes left ? => eat 8 bytes per step + for (; data + 8 <= stop; data += 8) + result = rotateLeft(result ^ processSingle(0, *(uint64_t*)data), 27) * Prime1 + Prime4; + + // 4 bytes left ? 
=> eat those + if (data + 4 <= stop) + { + result = rotateLeft(result ^ (*(uint32_t*)data) * Prime1, 23) * Prime2 + Prime3; + data += 4; + } + + // take care of remaining 0..3 bytes, eat 1 byte per step + while (data != stop) + result = rotateLeft(result ^ (*data++) * Prime5, 11) * Prime1; + + // mix bits + result ^= result >> 33; + result *= Prime2; + result ^= result >> 29; + result *= Prime3; + result ^= result >> 32; + return result; + } + + + /// combine constructor, add() and hash() in one static function (C style) + /** @param input pointer to a continuous block of data + @param length number of bytes + @param seed your seed value, e.g. zero is a valid seed + @return 64 bit XXHash **/ + static uint64_t hash(const void* input, uint64_t length, uint64_t seed) + { + XXHash64 hasher(seed); + hasher.add(input, length); + return hasher.hash(); + } + +private: + /// magic constants :-) + static const uint64_t Prime1 = 11400714785074694791ULL; + static const uint64_t Prime2 = 14029467366897019727ULL; + static const uint64_t Prime3 = 1609587929392839161ULL; + static const uint64_t Prime4 = 9650029242287828579ULL; + static const uint64_t Prime5 = 2870177450012600261ULL; + + /// temporarily store up to 31 bytes between multiple add() calls + static const uint64_t MaxBufferSize = 31+1; + + uint64_t state[4]; + unsigned char buffer[MaxBufferSize]; + uint64_t bufferSize; + uint64_t totalLength; + + /// rotate bits, should compile to a single CPU instruction (ROL) + static inline uint64_t rotateLeft(uint64_t x, unsigned char bits) + { + return (x << bits) | (x >> (64 - bits)); + } + + /// process a single 64 bit value + static inline uint64_t processSingle(uint64_t previous, uint64_t input) + { + return rotateLeft(previous + input * Prime2, 31) * Prime1; + } + + /// process a block of 4x4 bytes, this is the main part of the XXHash32 algorithm + static inline void process(const void* data, uint64_t& state0, uint64_t& state1, uint64_t& state2, uint64_t& state3) + { + const 
uint64_t* block = (const uint64_t*) data; + state0 = processSingle(state0, block[0]); + state1 = processSingle(state1, block[1]); + state2 = processSingle(state2, block[2]); + state3 = processSingle(state3, block[3]); + } +}; diff --git a/filters/CMakeLists.txt b/filters/CMakeLists.txt new file mode 100644 index 00000000..5be51487 --- /dev/null +++ b/filters/CMakeLists.txt @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +add_library(filters INTERFACE) + +add_library(${PROJECT_NAME}::FILTER ALIAS filters) + +if (BUILD_TESTS) + add_subdirectory(test) +endif() + +target_include_directories(filters + INTERFACE + $ + $/include> +) + +target_link_libraries(filters INTERFACE common) + +install(TARGETS filters + EXPORT ${PROJECT_NAME} +) + +install(FILES + include/bloom_filter.hpp + include/bloom_filter_impl.hpp + include/bit_array.hpp + include/bit_array_impl.hpp + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches") diff --git a/filters/include/bit_array.hpp b/filters/include/bit_array.hpp new file mode 100644 index 00000000..94daa543 --- /dev/null +++ b/filters/include/bit_array.hpp @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef _BIT_ARRAY_HPP_ +#define _BIT_ARRAY_HPP_ + +#include +#include + +#include "common_defs.hpp" + +namespace datasketches { + +// forward declarations +template class bit_array_alloc; + +/// bit_array alias with default allocator +using bit_array = bit_array_alloc>; + +/** + * This class holds an array of bits suitable for use in a Bloom Filter. + * The representation is not compressed and is designed to fit in a single array + * in Java, meaning that the maximum number of bits is limited by the maximum + * size of an array of longs. + * + * For compatibility with Java, rounds the number of bits up to the smallest multiple of 64 + * (one long) that is not smaller than the specified number. + */ +template> +class bit_array_alloc { + using A = Allocator; + +public: + /** + * Construct a bit array with the given number of bits. + * @param num_bits the number of bits to represent. + */ + explicit bit_array_alloc(const uint64_t num_bits, const Allocator& allocator = Allocator()); + + bool is_empty() const; + + bool is_dirty() const; + + /** + * Get the value of a bit at the given index. + * @param index the index of the bit to get + * @return the value of the bit at the given index.
+ */ + bool get_bit(const uint64_t index) const; + + /** + * Set the bit at the given index to 1. + * @param index the index of the bit to set. + */ + void set_bit(const uint64_t index); + + /** + * Set the bit at the given index to 0. + * @param index the index of the bit to clear. + */ + void clear_bit(const uint64_t index); + + /** + * Assign the value of the bit at the given index. + * @param index the index of the bit to set. + */ + void assign_bit(const uint64_t index, const bool value); + + /** + * Gets the value of a bit at the specified index and sets it to true + * @param index the index of the bit to get and set + * @return the value of the bit at the specified index + */ + bool get_and_set_bit(const uint64_t index); + + /** + * @brief Gets the number of bits set in the bit array. + * @return the number of bits set in the bit array. + */ + uint64_t get_num_bits_set(); + + /** + * Resets the bit_array, setting all bits to 0. + */ + void reset(); + + /** + * Gets the capacity of the bit array in bits. + * @return the capacity of the bit array in bits. + */ + uint64_t get_capacity() const; + + /** + * Performs a union operation on this bit array with another bit array. + * This operation modifies the current bit array to be the union of its original bits and the bits of the other array. + * The union operation is equivalent to a bitwise OR operation between the two arrays. + * + * @param other The other bit array to union with this one. + */ + void union_with(const bit_array_alloc& other); + + /** + * Performs an intersection operation on this bit array with another bit array. + * This operation modifies the current bit array to contain only the bits that are set in both this array and the other array. + * The intersection operation is equivalent to a bitwise AND operation between the two arrays. + * + * @param other The other bit array to intersect with this one.
+ */ + void intersect(const bit_array_alloc& other); + + /** + * Inverts the bits of this bit array. + * This operation modifies the current bit array by flipping all its bits; 0s become 1s and 1s become 0s. + */ + void invert(); + + /** + * Returns a string representation of the bit_array + * @return a string representation of the bit_array + */ + string to_string() const; + + /** + * @brief Get the allocator object + * + * @return Allocator + */ + Allocator get_allocator() const; + +private: + A allocator_; + uint64_t num_bits_set_; // if -1, need to recompute value + bool is_dirty_; + std::vector data_; +}; + +} // namespace datasketches + +#include "bit_array_impl.hpp" + +#endif // _BIT_ARRAY_HPP_ \ No newline at end of file diff --git a/filters/include/bit_array_impl.hpp b/filters/include/bit_array_impl.hpp new file mode 100644 index 00000000..aaef4da3 --- /dev/null +++ b/filters/include/bit_array_impl.hpp @@ -0,0 +1,214 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#ifndef _BIT_ARRAY_IMPL_HPP_ +#define _BIT_ARRAY_IMPL_HPP_ + +#include +#include + +#include "common_defs.hpp" +#include "bit_array.hpp" + +namespace datasketches { + +template +bit_array_alloc::bit_array_alloc(const uint64_t num_bits, const A& allocator) : + allocator_(allocator), + num_bits_set_(0), + is_dirty_(false) + { + if (num_bits == 0) { + throw std::invalid_argument("Number of bits must be greater than zero"); + } else if (num_bits >= (((1ULL << 31) - 1) * 64)) { + throw std::invalid_argument("Bits must be representable in fewer than 2^31 64-bit values"); + } + + // round up to the nearest multiple of 64, in bytes + data_ = std::vector(((num_bits + 63) >> 6) << 3, 0, allocator); +} + +template +bool bit_array_alloc::is_empty() const { + return !is_dirty_ && num_bits_set_ == 0; +} + +template +bool bit_array_alloc::is_dirty() const { + return is_dirty_; +} + +template +bool bit_array_alloc::get_bit(const uint64_t index) const { + if (index >= data_.size() << 3) { + throw std::out_of_range("Index out of range"); + } + return (data_[index >> 3] & (1 << (index & 7))) != 0; +} + +template +void bit_array_alloc::set_bit(const uint64_t index) { + if (index >= (data_.size() << 3)) { + std::cout << "index: " << index << ", size: " << data_.size() << ", size << 3: " << (data_.size() << 3) << std::endl; + throw std::out_of_range("Index out of range"); + } + data_[index >> 3] |= (1 << (index & 7)); + is_dirty_ = true; +} + +template +void bit_array_alloc::clear_bit(const uint64_t index) { + if (index >= data_.size() << 3) { + throw std::out_of_range("Index out of range"); + } + data_[index >> 3] &= ~(1 << (index & 7)); + is_dirty_ = true; +} + +template +void bit_array_alloc::assign_bit(const uint64_t index, const bool value) { + if (value) { + set_bit(index); + } else { + clear_bit(index); + } +} + +template +bool bit_array_alloc::get_and_set_bit(const uint64_t index) { + if (index >= data_.size() << 3) { + throw std::out_of_range("Index out of range"); + } 
+ const uint64_t offset = index >> 3; + const uint8_t mask = 1 << (index & 7); + if ((data_[offset] & mask) != 0) { + return true; + } else { + data_[offset] |= mask; + ++num_bits_set_; // increment the number of bits set regardless of is_dirty_ + return false; + } +} + +template +uint64_t bit_array_alloc::get_num_bits_set() { + if (is_dirty_) { + num_bits_set_ = 0; + + // we rounded up to a multiple of 64 so we know we can use 64-bit operations + const uint64_t* data64 = reinterpret_cast(data_.data()); + // Calculate the number of 64-bit chunks + uint64_t num_longs = data_.size() / 8; // 8 bytes per 64 bits + for (uint64_t i = 0; i < num_longs; ++i) { + // Wrap the 64-bit chunk with std::bitset for easy bit counting + std::bitset<64> bits(data64[i]); + num_bits_set_ += bits.count(); + } + is_dirty_ = false; + } + return num_bits_set_; +} + +template +uint64_t bit_array_alloc::get_capacity() const { + return data_.size() << 3; // size in bits +} + +template +void bit_array_alloc::reset() { + uint8_t* data = data_.data(); + std::fill(data, data + data_.size(), 0); + num_bits_set_ = 0; + is_dirty_ = false; +} + +template +void bit_array_alloc::union_with(const bit_array_alloc& other) { + if (data_.size() != other.data_.size()) { + throw std::invalid_argument("Cannot union bit arrays with unequal lengths"); + } + + num_bits_set_ = 0; + for (uint64_t i = 0; i < data_.size(); ++i) { + data_[i] |= other.data_[i]; + std::bitset<8> bits(data_[i]); + num_bits_set_ += bits.count(); + } + is_dirty_ = false; +} + +template +void bit_array_alloc::intersect(const bit_array_alloc& other) { + if (data_.size() != other.data_.size()) { + throw std::invalid_argument("Cannot intersect bit arrays with unequal lengths"); + } + + num_bits_set_ = 0; + for (uint64_t i = 0; i < data_.size(); ++i) { + data_[i] &= other.data_[i]; + std::bitset<8> bits(data_[i]); + num_bits_set_ += bits.count(); + } + is_dirty_ = false; +} + +template +void bit_array_alloc::invert() { + if (is_dirty_) { + 
num_bits_set_ = 0; + for (uint64_t i = 0; i < data_.size(); ++i) { + data_[i] = ~data_[i]; + std::bitset<8> bits(data_[i]); + num_bits_set_ += bits.count(); + } + is_dirty_ = false; + } else { + for (uint64_t i = 0; i < data_.size(); ++i) { + data_[i] = ~data_[i]; + } + num_bits_set_ = get_capacity() - num_bits_set_; + } +} + +template +A bit_array_alloc::get_allocator() const { + return allocator_; +} + +template +string bit_array_alloc::to_string() const { + std::ostringstream oss; + uint64_t num_blocks = data_.size() / 8; // groups of 64 bits + for (uint64_t i = 0; i < num_blocks; ++i) { + oss << i << ": "; + for (uint64_t j = 0; j < 8; ++j) { // bytes w/in a block + for (uint64_t b = 0; b < 8; ++b) { // bits w/in a byte + oss << ((data_[i * 8 + j] & (1 << b)) ? "1" : "0"); + } + oss << " "; + } + oss << std::endl; + } + oss << std::endl; + return oss.str(); +} + +} // namespace datasketches + +#endif // _BIT_ARRAY_IMPL_HPP_ \ No newline at end of file diff --git a/filters/include/bloom_filter.hpp b/filters/include/bloom_filter.hpp new file mode 100644 index 00000000..f81ed688 --- /dev/null +++ b/filters/include/bloom_filter.hpp @@ -0,0 +1,432 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#ifndef _BLOOM_FILTER_HPP_ +#define _BLOOM_FILTER_HPP_ + +#include +#include + +#include "bit_array.hpp" +#include "common_defs.hpp" + +namespace datasketches { + +// forward declarations +template class bloom_filter_alloc; + +/// bloom_filter alias with default allocator +using bloom_filter = bloom_filter_alloc>; + + +template> +class bloom_filter_alloc { + using A = Allocator; + +public: + + bloom_filter_alloc(const uint64_t num_bits, const uint16_t num_hashes, const uint64_t seed, const Allocator& allocator = Allocator()); + + /** + * Checks if the Bloom Filter has processed any items + * @return True if the BloomFilter is empty, otherwise False + */ + bool is_empty() const; + + /** + * Returns the number of bits in the Bloom Filter that are set to 1. + * @return The number of bits in use in this filter + */ + uint64_t get_bits_used() const; + + /** + * Returns the total number of bits in the Bloom Filter. + * @return The total size of the Bloom Filter + */ + uint64_t get_capacity() const; + + /** + * Returns the configured number of hash functions for this Bloom Filter + * @return The number of hash functions to apply to inputs + */ + uint16_t get_num_hashes() const; + + /** + * Returns the hash seed for this Bloom Filter. + * @return The hash seed for this filter + */ + uint64_t get_seed() const; + + /** + * Resets the Bloom Filter to its original state. + */ + void reset(); + + // UPDATE METHODS + + /** + * Updates the filter with the given std::string. + * The string is converted to a byte array using UTF8 encoding. + * If the string is null or empty no update attempt is made and the method returns. + * @param item The given string. + */ + void update(const std::string& item); + + /** + * Updates the filter with the given unsigned 64-bit integer. + * @param item The given integer. + */ + void update(uint64_t item); + + /** + * Updates the filter with the given unsigned 32-bit integer. + * @param item The given integer.
+ */ + void update(uint32_t item); + + /** + * Updates the filter with the given unsigned 16-bit integer. + * @param item The given integer. + */ + void update(uint16_t item); + + /** + * Updates the filter with the given unsigned 8-bit integer. + * @param item The given integer. + */ + void update(uint8_t item); + + /** + * Updates the filter with the given signed 64-bit integer. + * @param item The given integer. + */ + void update(int64_t item); + + /** + * Updates the filter with the given signed 32-bit integer. + * @param item The given integer. + */ + void update(int32_t item); + + /** + * Updates the filter with the given signed 16-bit integer. + * @param item The given integer. + */ + void update(int16_t item); + + /** + * Updates the filter with the given signed 8-bit integer. + * @param item The given integer. + */ + void update(int8_t item); + + /** + * Updates the filter with the given 64-bit floating point value. + * @param item The given double. + */ + void update(double item); + + /** + * Updates the filter with the given 32-bit floating point value. + * @param item The given float. + */ + void update(float item); + + /** + * Updates the filter with the given data array. + * @param data The given array. + * @param length_bytes The array length in bytes. + */ + void update(const void* data, size_t length_bytes); + + // QUERY-AND-UPDATE METHODS + + /** + * Updates the filter with the given std::string and returns the result from + * querying the filter prior to the update. + * The string is converted to a byte array using UTF8 encoding. + * If the string is null or empty no update attempt is made and the method returns false. + * @param item The given string. + * @return The result from querying the filter prior to the update. + */ + bool query_and_update(const std::string& item); + + /** + * Updates the filter with the given unsigned 64-bit integer and returns the result from + * querying the filter prior to the update.
+ * @param item The given integer. + * @return The result from querying the filter prior to the update. + */ + bool query_and_update(uint64_t item); + + /** + * Updates the filter with the given unsigned 32-bit integer and returns the result from + * querying the filter prior to the update. + * @param item The given integer. + * @return The result from querying the filter prior to the update. + */ + bool query_and_update(uint32_t item); + + /** + * Updates the filter with the given unsigned 16-bit integer and returns the result from + * querying the filter prior to the update. + * @param item The given integer. + * @return The result from querying the filter prior to the update. + */ + bool query_and_update(uint16_t item); + + /** + * Updates the filter with the given unsigned 8-bit integer and returns the result from + * querying the filter prior to the update. + * @param item The given integer. + * @return The result from querying the filter prior to the update. + */ + bool query_and_update(uint8_t item); + + /** + * Updates the filter with the given signed 64-bit integer and returns the result from + * querying the filter prior to the update. + * @param item The given integer. + * @return The result from querying the filter prior to the update. + */ + bool query_and_update(int64_t item); + + /** + * Updates the filter with the given signed 32-bit integer and returns the result from + * querying the filter prior to the update. + * @param item The given integer. + * @return The result from querying the filter prior to the update. + */ + bool query_and_update(int32_t item); + + /** + * Updates the filter with the given signed 16-bit integer and returns the result from + * querying the filter prior to the update. + * @param item The given integer. + * @return The result from querying the filter prior to the update. 
+ */ + bool query_and_update(int16_t item); + + /** + * Updates the filter with the given signed 8-bit integer and returns the result from + * querying the filter prior to the update. + * @param item The given integer. + * @return The result from querying the filter prior to the update. + */ + bool query_and_update(int8_t item); + + /** + * Updates the filter with the given 64-bit floating point value and returns the result from + * querying the filter prior to the update. + * @param item The given double. + * @return The result from querying the filter prior to the update. + */ + bool query_and_update(double item); + + /** + * Updates the filter with the given 32-bit floating point value and returns the result from + * querying the filter prior to the update. + * @param item The given float. + * @return The result from querying the filter prior to the update. + */ + bool query_and_update(float item); + + /** + * Updates the filter with the given data array and returns the result from + * querying the filter prior to the update. + * @param data The given array. + * @param length_bytes The array length in bytes. + * @return The result from querying the filter prior to the update. + */ + bool query_and_update(const void* data, size_t length_bytes); + + // QUERY METHODS + + /** + * Queries the filter with the given std::string and returns whether the value + * might have been seen previously. The filter's expected False Positive Probability + * determines the chances of a true result being a false positive. False negatives are + * never possible. + * The string is converted to a byte array using UTF8 encoding. + * If the string is null or empty the method always returns false. + * @param item The given string. + * @return The result from querying the filter with the given item. + */ + bool query(const std::string& item) const; + + /** + * Queries the filter with the given unsigned 64-bit integer and returns whether the value + * might have been seen previously.
The filter's expected False Positive Probability + * determines the chances of a true result being a false positive. False negatives are + * never possible. + * @param item The given integer. + * @return The result from querying the filter with the given item. + */ + bool query(uint64_t item) const; + + /** + * Queries the filter with the given unsigned 32-bit integer and returns whether the value + * might have been seen previously. The filter's expected False Positive Probability + * determines the chances of a true result being a false positive. False negatives are + * never possible. + * @param item The given integer. + * @return The result from querying the filter with the given item. + */ + bool query(uint32_t item) const; + + /** + * Queries the filter with the given unsigned 16-bit integer and returns whether the value + * might have been seen previously. The filter's expected False Positive Probability + * determines the chances of a true result being a false positive. False negatives are + * never possible. + * @param item The given integer. + * @return The result from querying the filter with the given item. + */ + bool query(uint16_t item) const; + + /** + * Queries the filter with the given unsigned 8-bit integer and returns whether the value + * might have been seen previously. The filter's expected False Positive Probability + * determines the chances of a true result being a false positive. False negatives are + * never possible. + * @param item The given integer. + * @return The result from querying the filter with the given item. + */ + bool query(uint8_t item) const; + + /** + * Queries the filter with the given signed 64-bit integer and returns whether the value + * might have been seen previously. The filter's expected False Positive Probability + * determines the chances of a true result being a false positive. False negatives are + * never possible. + * @param item The given integer.
+ * @return The result from querying the filter with the given item. + */ + bool query(int64_t item) const; + + /** + * Queries the filter with the given signed 32-bit integer and returns whether the value + * might have been seen previously. The filter's expected False Positive Probability + * determines the chances of a true result being a false positive. False negatives are + * never possible. + * @param item The given integer. + * @return The result from querying the filter with the given item. + */ + bool query(int32_t item) const; + + /** + * Queries the filter with the given signed 16-bit integer and returns whether the value + * might have been seen previously. The filter's expected False Positive Probability + * determines the chances of a true result being a false positive. False negatives are + * never possible. + * @param item The given integer. + * @return The result from querying the filter with the given item. + */ + bool query(int16_t item) const; + + /** + * Queries the filter with the given signed 8-bit integer and returns whether the value + * might have been seen previously. The filter's expected False Positive Probability + * determines the chances of a true result being a false positive. False negatives are + * never possible. + * @param item The given integer. + * @return The result from querying the filter with the given item. + */ + bool query(int8_t item) const; + + /** + * Queries the filter with the given 64-bit floating point value and returns whether the value + * might have been seen previously. The filter's expected False Positive Probability + * determines the chances of a true result being a false positive. False negatives are + * never possible. + * @param item The given double. + * @return The result from querying the filter with the given item. + */ + bool query(double item) const; + + /** + * Queries the filter with the given 32-bit floating point value and returns whether the value + * might have been seen previously.
The filter's expected False Positive Probability + * determines the chances of a true result being a false positive. False negatives are + * never possible. + * @param item The given float. + * @return The result from querying the filter with the given item. + */ + bool query(float item) const; + + /** + * Queries the filter with the given data array and returns whether the value + * might have been seen previously. + * The filter's expected False Positive Probability + * determines the chances of a true result being a false positive. False negatives are + * never possible. + * @param data The given array. + * @param length_bytes The array length in bytes. + * @return The result from querying the filter with the given item. + */ + bool query(const void* data, size_t length_bytes) const; + + // OTHER OPERATIONS + + /** + * Unions two Bloom Filters by applying a logical OR. The result will recognize + * any values seen by either filter (as well as false positives). + * @param other A BloomFilter to union with this one + */ + void union_with(const bloom_filter_alloc& other); + + /** + * Intersects two Bloom Filters by applying a logical AND. The result will recognize + * only values seen by both filters (as well as false positives). + * @param other A Bloom Filter to intersect with this one + */ + void intersect(const bloom_filter_alloc& other); + + /** + * Inverts all the bits of the BloomFilter. Approximately inverts the notion of set-membership. + */ + void invert(); + + /** + * Helps identify if two Bloom Filters may be unioned or intersected.
+ * @param other A Bloom Filter to check for compatibility with this one + * @return True if the filters are compatible, otherwise false + */ + bool is_compatible(const bloom_filter_alloc& other) const; + + // TODO: Serialization + + +private: + // internal query/update methods + void internal_update(const uint64_t h0, const uint64_t h1); + bool internal_query_and_update(const uint64_t h0, const uint64_t h1); + bool internal_query(const uint64_t h0, const uint64_t h1) const; + + A allocator_; + uint64_t seed_; + uint16_t num_hashes_; + bit_array_alloc bit_array_; +}; + +} // namespace datasketches + +#include "bloom_filter_impl.hpp" + +#endif // _BLOOM_FILTER_HPP_ b diff --git a/filters/include/bloom_filter_impl.hpp b/filters/include/bloom_filter_impl.hpp new file mode 100644 index 00000000..43f50fad --- /dev/null +++ b/filters/include/bloom_filter_impl.hpp @@ -0,0 +1,382 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#ifndef _BLOOM_FILTER_IMPL_HPP_ +#define _BLOOM_FILTER_IMPL_HPP_ + +#include +#include + +#include "common_defs.hpp" +#include "xxhash64.h" + +namespace datasketches { + +template +bloom_filter_alloc::bloom_filter_alloc(const uint64_t num_bits, const uint16_t num_hashes, uint64_t seed, const A& allocator) : + allocator_(allocator), + seed_(seed), + num_hashes_(num_hashes), + bit_array_(num_bits, allocator) + {} + +template +bool bloom_filter_alloc::is_empty() const { + return bit_array_.is_empty(); +} + +template +uint64_t bloom_filter_alloc::get_bits_used() const { + return bit_array_.get_num_bits_set(); +} + +template +uint64_t bloom_filter_alloc::get_capacity() const { + return bit_array_.get_capacity(); +} + +template +uint16_t bloom_filter_alloc::get_num_hashes() const { + return num_hashes_; +} + +template +uint64_t bloom_filter_alloc::get_seed() const { + return seed_; +} + +template +void bloom_filter_alloc::reset() { + bit_array_.reset(); +} + +// UPDATE METHODS + +template +void bloom_filter_alloc::update(const std::string& item) { + if (item.empty()) return; + const uint64_t h0 = XXHash64::hash(item.data(), item.size(), seed_); + const uint64_t h1 = XXHash64::hash(item.data(), item.size(), h0); + internal_update(h0, h1); +} + +template +void bloom_filter_alloc::update(const uint64_t item) { + const uint64_t h0 = XXHash64::hash(&item, sizeof(item), seed_); + const uint64_t h1 = XXHash64::hash(&item, sizeof(item), h0); + internal_update(h0, h1); +} + +template +void bloom_filter_alloc::update(const uint32_t item) { + update(static_cast(item)); +} + +template +void bloom_filter_alloc::update(const uint16_t item) { + update(static_cast(item)); +} + +template +void bloom_filter_alloc::update(const uint8_t item) { + update(static_cast(item)); +} + +template +void bloom_filter_alloc::update(const int64_t item) { + const uint64_t h0 = XXHash64::hash(&item, sizeof(item), seed_); + const uint64_t h1 = XXHash64::hash(&item, sizeof(item), h0); + 
internal_update(h0, h1); +} + +template +void bloom_filter_alloc::update(const int32_t item) { + update(static_cast(item)); +} + +template +void bloom_filter_alloc::update(const int16_t item) { + update(static_cast(item)); +} + +template +void bloom_filter_alloc::update(const int8_t item) { + update(static_cast(item)); +} + +template +void bloom_filter_alloc::update(const double item) { + union { + int64_t long_value; + double double_value; + } ldu; + ldu.doubleBytes = static_cast(item); + if (item == 0.0) { + ldu.doubleBytes = 0.0; // canonicalize -0.0 to 0.0 + } else if (std::isnan(ldu.doubleBytes)) { + ldu.longBytes = 0x7ff8000000000000L; // canonicalize NaN using value from Java's Double.doubleToLongBits() + } + const uint64_t h0 = XXHash64::hash(&ldu, sizeof(ldu), seed_); + const uint64_t h1 = XXHash64::hash(&ldu, sizeof(ldu), h0); + internal_update(h0, h1); +} + +template +void bloom_filter_alloc::update(const float item) { + update(static_cast(item)); +} + +template +void bloom_filter_alloc::update(const void* item, size_t size) { + if (item == nullptr || size == 0) return; + const uint64_t h0 = XXHash64::hash(item, size, seed_); + const uint64_t h1 = XXHash64::hash(item, size, h0); + internal_update(h0, h1); +} + +template +void bloom_filter_alloc::internal_update(const uint64_t h0, const uint64_t h1) { + const uint64_t num_bits = bit_array_.get_capacity(); + for (uint16_t i = 1; i <= num_hashes_; i++) { + const uint64_t hash_index = (h0 + i * h1; >> 1) % num_bits; + bit_array_.set_bit(bit_index); + } +} + +// QUERY-AND-UPDATE METHODS + +template +bool bloom_filter_alloc::query_and_update(const std::string& item) { + if (item.empty()) return; + const uint64_t h0 = XXHash64::hash(item.data(), item.size(), seed_); + const uint64_t h1 = XXHash64::hash(item.data(), item.size(), h0); + return internal_query_and_update(h0, h1); +} + +template +bool bloom_filter_alloc::query_and_update(const uint64_t item) { + const uint64_t h0 = XXHash64::hash(&item, 
sizeof(item), seed_); + const uint64_t h1 = XXHash64::hash(&item, sizeof(item), h0); + return internal_query_and_update(h0, h1); +} + +template +bool bloom_filter_alloc::query_and_update(const uint32_t item) { + return query_and_update(static_cast(item)); +} + +template +bool bloom_filter_alloc::query_and_update(const uint16_t item) { + return query_and_update(static_cast(item)); +} + +template +bool bloom_filter_alloc::query_and_update(const uint8_t item) { + return query_and_update(static_cast(item)); +} + +template +bool bloom_filter_alloc::query_and_update(const int64_t item) { + const uint64_t h0 = XXHash64::hash(&item, sizeof(item), seed_); + const uint64_t h1 = XXHash64::hash(&item, sizeof(item), h0); + return internal_query_and_update(h0, h1); +} + +template +bool bloom_filter_alloc::query_and_update(const int32_t item) { + return query_and_update(static_cast(item)); +} + +template +bool bloom_filter_alloc::query_and_update(const int16_t item) { + return query_and_update(static_cast(item)); +} + +template +bool bloom_filter_alloc::query_and_update(const int8_t item) { + return query_and_update(static_cast(item)); +} + +template +bool bloom_filter_alloc::query_and_update(const double item) { + union { + int64_t long_value; + double double_value; + } ldu; + ldu.doubleBytes = item; + if (item == 0.0) { + ldu.doubleBytes = 0.0; // canonicalize -0.0 to 0.0 + } else if (std::isnan(ldu.doubleBytes)) { + ldu.longBytes = 0x7ff8000000000000L; // canonicalize NaN using value from Java's Double.doubleToLongBits() + } + const uint64_t h0 = XXHash64::hash(&ldu, sizeof(ldu), seed_); + const uint64_t h1 = XXHash64::hash(&ldu, sizeof(ldu), h0); + return internal_query_and_update(h0, h1); +} + +template +bool bloom_filter_alloc::query_and_update(const float item) { + return query_and_update(static_cast(item)); +} + +template +bool bloom_filter_alloc::query_and_update(const void* item, size_t size) { + if (item == nullptr || size == 0) return; + const uint64_t h0 = 
XXHash64::hash(item, size, seed_); + const uint64_t h1 = XXHash64::hash(item, size, h0); + return internal_query_and_update(h0, h1); +} + +template +bool bloom_filter_alloc::internal_query_and_update(const uint64_t h0, const uint64_t h1) { + const uint64_t num_bits = bit_array_.get_capacity(); + bool value_exists = true; + for (uint16_t i = 0; i < num_hashes_; i++) { + const uint64_t hash_index = (h0 + i * h1; >> 1) % num_bits; + value_exists &= bit_array_.get_and_set_bit(bit_index); + } + return value_exists; +} + +// QUERY METHODS + +template +bool bloom_filter_alloc::query(const std::string& item) const { + if (item.empty()) return; + const uint64_t h0 = XXHash64::hash(item.data(), item.size(), seed_); + const uint64_t h1 = XXHash64::hash(item.data(), item.size(), h0); + return internal_query(h0, h1); +} + +template +bool bloom_filter_alloc::query(const uint64_t item) const { + const uint64_t h0 = XXHash64::hash(&item, sizeof(item), seed_); + const uint64_t h1 = XXHash64::hash(&item, sizeof(item), h0); + return internal_query(h0, h1); +} + +template +bool bloom_filter_alloc::query(const uint32_t item) const { + return query(static_cast(item)); +} + +template +bool bloom_filter_alloc::query(const uint16_t item) const { + return query(static_cast(item)); +} + +template +bool bloom_filter_alloc::query(const uint8_t item) const { + return query(static_cast(item)); +} + +template +bool bloom_filter_alloc::query(const int64_t item) const { + const uint64_t h0 = XXHash64::hash(&item, sizeof(item), seed_); + const uint64_t h1 = XXHash64::hash(&item, sizeof(item), h0); + return internal_query(h0, h1); +} + +template +bool bloom_filter_alloc::query(const int32_t item) const { + return query(static_cast(item)); +} + +template +bool bloom_filter_alloc::query(const int16_t item) const { + return query(static_cast(item)); +} + +template +bool bloom_filter_alloc::query(const int8_t item) const { + return query(static_cast(item)); +} + +template +bool 
bloom_filter_alloc::query(const double item) const { + union { + int64_t long_value; + double double_value; + } ldu; + ldu.doubleBytes = static_cast(item); + if (item == 0.0) { + ldu.doubleBytes = 0.0; // canonicalize -0.0 to 0.0 + } else if (std::isnan(ldu.doubleBytes)) { + ldu.longBytes = 0x7ff8000000000000L; // canonicalize NaN using value from Java's Double.doubleToLongBits() + } + const uint64_t h0 = XXHash64::hash(&ldu, sizeof(ldu), seed_); + const uint64_t h1 = XXHash64::hash(&ldu, sizeof(ldu), h0); + return internal_query(h0, h1); +} + +template +bool bloom_filter_alloc::query(const float item) const { + return query(static_cast(item)); +} + +template +bool bloom_filter_alloc::query(const void* item, size_t size) const { + if (item == nullptr || size == 0) return; + const uint64_t h0 = XXHash64::hash(item, size, seed_); + const uint64_t h1 = XXHash64::hash(item, size, h0); + return internal_query(h0, h1); +} + +template +bool bloom_filter_alloc::internal_query(const uint64_t h0, const uint64_t h1) const { + const uint64_t num_bits = bit_array_.get_capacity(); + for (uint16_t i = 0; i < num_hashes_; i++) { + const uint64_t hash_index = (h0 + i * h1; >> 1) % num_bits; + if (!bit_array_.get_bit(bit_index)) + return false; + } + return true; +} + +// OTHER METHODS + +template +bool bloom_filter_alloc::is_compatible(const bloom_filter_alloc& other) const { + return seed_ == other.seed_ && num_hashes_ == other.num_hashes_ && bit_array_.get_capacity() == other.bit_array_.get_capacity(); +} + +template +void bloom_filter_alloc::union_with(const bloom_filter_alloc& other) { + if (!is_compatible(other)) { + throw std::invalid_argument("Incompatible bloom filters"); + } + bit_array_.union_with(other.bit_array_); +} + +template +void bloom_filter_alloc::intersect(const bloom_filter_alloc& other) { + if (!is_compatible(other)) { + throw std::invalid_argument("Incompatible bloom filters"); + } + bit_array_.intersect(other.bit_array_); +} + +template +void 
bloom_filter_alloc::invert() { + bit_array_.invert(); +} + +} // namespace datasketches + +#endif // _BLOOM_FILTER_IMPL_HPP_ \ No newline at end of file diff --git a/filters/test/CMakeLists.txt b/filters/test/CMakeLists.txt new file mode 100644 index 00000000..9fa7f5de --- /dev/null +++ b/filters/test/CMakeLists.txt @@ -0,0 +1,45 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# separate executables for var_opt and ebpps sampling + +# BLOOM FILTER +add_executable(bloom_filter_test) + +target_link_libraries(bloom_filter_test filters common_test_lib) + +set_target_properties(bloom_filter_test PROPERTIES + CXX_STANDARD_REQUIRED YES +) + +file(TO_CMAKE_PATH "${CMAKE_CURRENT_SOURCE_DIR}" FILTERS_TEST_BINARY_PATH) +string(APPEND FILTERS_TEST_BINARY_PATH "/") +target_compile_definitions(bloom_filter_test + PRIVATE + TEST_BINARY_INPUT_PATH="${FILTERS_TEST_BINARY_PATH}" +) + +add_test( + NAME bloom_filter_test + COMMAND bloom_filter_test +) + +target_sources(bloom_filter_test + PRIVATE + bit_array_test.cpp + bloom_filter_test.cpp +) diff --git a/filters/test/bit_array_test.cpp b/filters/test/bit_array_test.cpp new file mode 100644 index 00000000..a3760beb --- /dev/null +++ b/filters/test/bit_array_test.cpp @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include +//#include +//#include +//#include + +#include "bit_array.hpp" + +#ifdef TEST_BINARY_INPUT_PATH +static std::string testBinaryInputPath = TEST_BINARY_INPUT_PATH; +#else +static std::string testBinaryInputPath = "test/"; +#endif + +namespace datasketches { + +TEST_CASE("bit_array: invalid num_bits", "[bit_array]") { + REQUIRE_THROWS_AS(bit_array(0), std::invalid_argument); + REQUIRE_THROWS_AS(bit_array(1L << 60), std::invalid_argument); +} + +TEST_CASE("bit_array: construction", "[bit_array]") { + bit_array ba(64); + REQUIRE(ba.get_capacity() == 64); + REQUIRE(ba.get_num_bits_set() == 0); + REQUIRE(ba.is_empty()); + REQUIRE(!ba.is_dirty()); +} + +TEST_CASE("bit_array: basic operation", "[bit_array]") { + bit_array ba(128); + REQUIRE(ba.get_and_set_bit(1) == false); + REQUIRE(ba.get_and_set_bit(2) == false); + for (int i = 4; i < 64; i <<= 1) { + REQUIRE(ba.get_and_set_bit(64 + i) == false); + } + + REQUIRE(ba.get_num_bits_set() == 6); + REQUIRE(ba.get_bit(68)); + REQUIRE(!ba.is_empty()); + + REQUIRE(ba.get_bit(5) == false); + ba.set_bit(5); + REQUIRE(ba.get_and_set_bit(5)); + REQUIRE(ba.get_num_bits_set() == 7); + + ba.clear_bit(5); + REQUIRE(ba.get_bit(5) == false); + REQUIRE(ba.get_num_bits_set() == 6); + + ba.reset(); + REQUIRE(ba.is_empty()); + REQUIRE(ba.get_num_bits_set() == 0); + + ba.set_bit(35); + REQUIRE(ba.get_and_set_bit(35)); + ba.assign_bit(35, false); + REQUIRE(ba.get_bit(35) == false); + ba.assign_bit(35, true); + REQUIRE(ba.get_bit(35)); + + REQUIRE(ba.to_string().length() > 0); +} + +TEST_CASE("bit_array: inversion", "[bit_array]") { + size_t num_bits = 1024; + bit_array ba(num_bits); + for (size_t i = 0; i < num_bits; i += num_bits / 8) { + ba.get_and_set_bit(i); + } + REQUIRE(ba.get_bit(0)); + + size_t num_bits_set = ba.get_num_bits_set(); + ba.invert(); + REQUIRE(ba.get_num_bits_set() == num_bits - num_bits_set); + REQUIRE(ba.get_bit(0) == false); + + // update to make dirty and invert again + ba.set_bit(0); + ba.invert(); + 
REQUIRE(ba.get_num_bits_set() == num_bits_set - 1); + REQUIRE(ba.get_bit(0) == false); +} + +TEST_CASE("bit_array: invalid union and intersection", "[bit_array]") { + bit_array ba1(64); + bit_array ba2(128); + REQUIRE_THROWS_AS(ba1.union_with(ba2), std::invalid_argument); + REQUIRE_THROWS_AS(ba1.intersect(ba2), std::invalid_argument); +} + +TEST_CASE("bit_array: intersection and union", "[bit_array]") { + bit_array ba1(64); + bit_array ba2(64); + bit_array ba3(64); + + size_t n = 10; + for (size_t i = 0; i < n; ++i) { + ba1.get_and_set_bit(i); + ba2.get_and_set_bit(i + (n / 2)); + ba3.get_and_set_bit(2 * i); + } + REQUIRE(ba1.get_num_bits_set() == n); + REQUIRE(ba2.get_num_bits_set() == n); + REQUIRE(ba3.get_num_bits_set() == n); + + ba1.intersect(ba2); + REQUIRE(ba1.get_num_bits_set() == n / 2); + + ba3.union_with(ba2); + REQUIRE(ba3.get_num_bits_set() == 3 * n / 2); +} + +} // namespace datasketches diff --git a/filters/test/bloom_filter_test.cpp b/filters/test/bloom_filter_test.cpp new file mode 100644 index 00000000..a16f29f2 --- /dev/null +++ b/filters/test/bloom_filter_test.cpp @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include +//#include +//#include +//#include + +#include "bloom_filter.hpp" + +#ifdef TEST_BINARY_INPUT_PATH +static std::string testBinaryInputPath = TEST_BINARY_INPUT_PATH; +#else +static std::string testBinaryInputPath = "test/"; +#endif + +namespace datasketches { + +TEST_CASE("bloom_filter: invalid constructor args", "[bloom_filter]") { + REQUIRE_THROWS_AS(bloom_filter(0, 4, DEFAULT_SEED), std::invalid_argument); + REQUIRE_THROWS_AS(bloom_filter(1L << 60, 4, DEFAULT_SEED), std::invalid_argument); + REQUIRE_THROWS_AS(bloom_filter(65535, 0, DEFAULT_SEED), std::invalid_argument); +} + +} // namespace datasketches From 46e236cd978c4691280ea7949a48165afd0ac88d Mon Sep 17 00:00:00 2001 From: Jon Malkin <786705+jmalkin@users.noreply.github.com> Date: Tue, 16 Jul 2024 16:17:07 -0700 Subject: [PATCH 02/30] bug fixes, but still WIP --- filters/include/bloom_filter_impl.hpp | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/filters/include/bloom_filter_impl.hpp b/filters/include/bloom_filter_impl.hpp index 43f50fad..162a4baa 100644 --- a/filters/include/bloom_filter_impl.hpp +++ b/filters/include/bloom_filter_impl.hpp @@ -34,7 +34,11 @@ bloom_filter_alloc::bloom_filter_alloc(const uint64_t num_bits, const uint16_ seed_(seed), num_hashes_(num_hashes), bit_array_(num_bits, allocator) - {} + { + if (num_hashes == 0) { + throw std::invalid_argument("Must have at least 1 hash function"); + } + } template bool bloom_filter_alloc::is_empty() const { @@ -154,8 +158,8 @@ template void bloom_filter_alloc::internal_update(const uint64_t h0, const uint64_t h1) { const uint64_t num_bits = bit_array_.get_capacity(); for (uint16_t i = 1; i <= num_hashes_; i++) { - const uint64_t hash_index = (h0 + i * h1; >> 1) % num_bits; - bit_array_.set_bit(bit_index); + const uint64_t hash_index = ((h0 + i * h1) >> 1) % num_bits; + bit_array_.set_bit(hash_index); } } @@ -163,7 +167,7 @@ void bloom_filter_alloc::internal_update(const uint64_t h0, 
const uint64_t h1 template bool bloom_filter_alloc::query_and_update(const std::string& item) { - if (item.empty()) return; + if (item.empty()) return false; const uint64_t h0 = XXHash64::hash(item.data(), item.size(), seed_); const uint64_t h1 = XXHash64::hash(item.data(), item.size(), h0); return internal_query_and_update(h0, h1); @@ -237,7 +241,7 @@ bool bloom_filter_alloc::query_and_update(const float item) { template bool bloom_filter_alloc::query_and_update(const void* item, size_t size) { - if (item == nullptr || size == 0) return; + if (item == nullptr || size == 0) return false; const uint64_t h0 = XXHash64::hash(item, size, seed_); const uint64_t h1 = XXHash64::hash(item, size, h0); return internal_query_and_update(h0, h1); @@ -248,8 +252,8 @@ bool bloom_filter_alloc::internal_query_and_update(const uint64_t h0, const u const uint64_t num_bits = bit_array_.get_capacity(); bool value_exists = true; for (uint16_t i = 0; i < num_hashes_; i++) { - const uint64_t hash_index = (h0 + i * h1; >> 1) % num_bits; - value_exists &= bit_array_.get_and_set_bit(bit_index); + const uint64_t hash_index = ((h0 + i * h1) >> 1) % num_bits; + value_exists &= bit_array_.get_and_set_bit(hash_index); } return value_exists; } @@ -258,7 +262,7 @@ bool bloom_filter_alloc::internal_query_and_update(const uint64_t h0, const u template bool bloom_filter_alloc::query(const std::string& item) const { - if (item.empty()) return; + if (item.empty()) return false; const uint64_t h0 = XXHash64::hash(item.data(), item.size(), seed_); const uint64_t h1 = XXHash64::hash(item.data(), item.size(), h0); return internal_query(h0, h1); @@ -332,7 +336,7 @@ bool bloom_filter_alloc::query(const float item) const { template bool bloom_filter_alloc::query(const void* item, size_t size) const { - if (item == nullptr || size == 0) return; + if (item == nullptr || size == 0) return false; const uint64_t h0 = XXHash64::hash(item, size, seed_); const uint64_t h1 = XXHash64::hash(item, size, h0); return 
internal_query(h0, h1); @@ -342,8 +346,8 @@ template bool bloom_filter_alloc::internal_query(const uint64_t h0, const uint64_t h1) const { const uint64_t num_bits = bit_array_.get_capacity(); for (uint16_t i = 0; i < num_hashes_; i++) { - const uint64_t hash_index = (h0 + i * h1; >> 1) % num_bits; - if (!bit_array_.get_bit(bit_index)) + const uint64_t hash_index = ((h0 + i * h1) >> 1) % num_bits; + if (!bit_array_.get_bit(hash_index)) return false; } return true; From 2a103ce12dffd758f8c68dfafb1010337b1a3fd9 Mon Sep 17 00:00:00 2001 From: Jon Malkin <786705+jmalkin@users.noreply.github.com> Date: Tue, 16 Jul 2024 16:45:30 -0700 Subject: [PATCH 03/30] add missing header --- filters/include/bit_array_impl.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/filters/include/bit_array_impl.hpp b/filters/include/bit_array_impl.hpp index aaef4da3..7ab939cd 100644 --- a/filters/include/bit_array_impl.hpp +++ b/filters/include/bit_array_impl.hpp @@ -22,6 +22,7 @@ #include #include +#include #include "common_defs.hpp" #include "bit_array.hpp" From 30aa213eda34972fc3a925101b5c6e98276b4790 Mon Sep 17 00:00:00 2001 From: Jon Malkin <786705+jmalkin@users.noreply.github.com> Date: Tue, 16 Jul 2024 16:50:48 -0700 Subject: [PATCH 04/30] fix included header --- filters/include/bit_array_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/filters/include/bit_array_impl.hpp b/filters/include/bit_array_impl.hpp index 7ab939cd..49dcb313 100644 --- a/filters/include/bit_array_impl.hpp +++ b/filters/include/bit_array_impl.hpp @@ -22,7 +22,7 @@ #include #include -#include +#include #include "common_defs.hpp" #include "bit_array.hpp" From e8c14d15e9e527aa5ca2a23e01a33dd58745ddcc Mon Sep 17 00:00:00 2001 From: Jon Malkin <786705+jmalkin@users.noreply.github.com> Date: Tue, 23 Jul 2024 11:14:21 -0700 Subject: [PATCH 05/30] Add builder methods, improve test coverage --- filters/CMakeLists.txt | 1 + filters/include/bit_array.hpp | 8 + filters/include/bit_array_impl.hpp 
| 35 ++-- filters/include/bloom_filter.hpp | 46 ++++- filters/include/bloom_filter_builder_impl.hpp | 97 ++++++++++ filters/include/bloom_filter_impl.hpp | 44 +++-- filters/test/bloom_filter_test.cpp | 175 +++++++++++++++++- 7 files changed, 377 insertions(+), 29 deletions(-) create mode 100644 filters/include/bloom_filter_builder_impl.hpp diff --git a/filters/CMakeLists.txt b/filters/CMakeLists.txt index 5be51487..2ded3ed3 100644 --- a/filters/CMakeLists.txt +++ b/filters/CMakeLists.txt @@ -38,6 +38,7 @@ install(TARGETS filters install(FILES include/bloom_filter.hpp include/bloom_filter_impl.hpp + include/bloom_filter_builder_impl.hpp include/bit_array.hpp include/bit_array_impl.hpp DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches") diff --git a/filters/include/bit_array.hpp b/filters/include/bit_array.hpp index 94daa543..78c088bb 100644 --- a/filters/include/bit_array.hpp +++ b/filters/include/bit_array.hpp @@ -89,6 +89,12 @@ class bit_array_alloc { */ bool get_and_set_bit(const uint64_t index); + /** + * @brief Gets the number of bits set in the bit array. + * @return the number of bits set in the bit array. + */ + uint64_t get_num_bits_set() const; + /** * @brief Gets the number of bits set in the bit array. * @return the number of bits set in the bit array. 
@@ -148,6 +154,8 @@ class bit_array_alloc { uint64_t num_bits_set_; // if -1, need to recompute value bool is_dirty_; std::vector data_; + + uint64_t count_bits_set() const; }; } // namespace datasketches diff --git a/filters/include/bit_array_impl.hpp b/filters/include/bit_array_impl.hpp index 49dcb313..41ae9f64 100644 --- a/filters/include/bit_array_impl.hpp +++ b/filters/include/bit_array_impl.hpp @@ -107,21 +107,34 @@ bool bit_array_alloc::get_and_set_bit(const uint64_t index) { } } +template +uint64_t bit_array_alloc::count_bits_set() const { + uint64_t num_bits_set = 0; + + // we rounded up to a multiple of 64 so we know we can use 64-bit operations + const uint64_t* data64 = reinterpret_cast(data_.data()); + // Calculate the number of 64-bit chunks + uint64_t num_longs = data_.size() / 8; // 8 bytes per 64 bits + for (uint64_t i = 0; i < num_longs; ++i) { + // Wrap the 64-bit chunk with std::bitset for easy bit counting + std::bitset<64> bits(data64[i]); + num_bits_set += bits.count(); + } + return num_bits_set; +} + template uint64_t bit_array_alloc::get_num_bits_set() { if (is_dirty_) { - num_bits_set_ = 0; + num_bits_set_ = count_bits_set(); + } + return num_bits_set_; +} - // we rounded up to a multiple of 64 so we know we can use 64-bit operations - const uint64_t* data64 = reinterpret_cast(data_.data()); - // Calculate the number of 64-bit chunks - uint64_t num_longs = data_.size() / 8; // 8 bytes per 64 bits - for (uint64_t i = 0; i < num_longs; ++i) { - // Wrap the 64-bit chunk with std::bitset for easy bit counting - std::bitset<64> bits(data64[i]); - num_bits_set_ += bits.count(); - } - is_dirty_ = false; +template +uint64_t bit_array_alloc::get_num_bits_set() const { + if (is_dirty_) { + return count_bits_set(); } return num_bits_set_; } diff --git a/filters/include/bloom_filter.hpp b/filters/include/bloom_filter.hpp index f81ed688..e9076c81 100644 --- a/filters/include/bloom_filter.hpp +++ b/filters/include/bloom_filter.hpp @@ -30,10 +30,38 @@ 
namespace datasketches { // forward declarations template class bloom_filter_alloc; +template class bloom_filter_builder_alloc; -/// bit_array alias with default allocator +// aliases with default allocator using bloom_filter = bloom_filter_alloc>; +using bloom_filter_builder = bloom_filter_builder_alloc>; +template> +class bloom_filter_builder_alloc { + using A = Allocator; + +public: + static uint16_t suggest_num_hashes(const uint64_t num_distinct_items, const uint64_t num_filter_bits); + static uint16_t suggest_num_hashes(const double target_false_positive_prob); + static uint64_t suggest_num_filter_bits(const uint64_t num_distinct_items, const double target_false_positive_prob); + + static bloom_filter_alloc create_by_accuracy(const uint64_t num_distinct_items, + const double target_false_positive_prob, + const Allocator& allocator = Allocator()); + static bloom_filter_alloc create_by_accuracy(const uint64_t num_distinct_items, + const double target_false_positive_prob, + const uint64_t seed, + const Allocator& allocator = Allocator()); + + static bloom_filter_alloc create_by_size(const uint64_t num_bits, + const uint16_t num_hashes, + const Allocator& allocator = Allocator()); + static bloom_filter_alloc create_by_size(const uint64_t num_bits, + const uint16_t num_hashes, + const uint64_t seed, + const Allocator& allocator = Allocator()); + +}; template> class bloom_filter_alloc { @@ -41,8 +69,6 @@ class bloom_filter_alloc { public: - bloom_filter_alloc(const uint64_t num_bits, const uint16_t num_hashes, const uint64_t seed, const Allocator& allocator = Allocator()); - /** * Checks if the Bloom Filter has processed any items * @return True if the BloomFilter is empty, otherwise False @@ -53,7 +79,7 @@ class bloom_filter_alloc { * Returns the number of bits in the Bloom Filter that are set to 1. 
* @return The number of bits in use in this filter */ - uint64_t get_bits_used() const; + uint64_t get_bits_used(); /** * Returns the total number of bits in the Bloom Filter. @@ -412,8 +438,17 @@ class bloom_filter_alloc { // TODO: Serialization + /** + * @brief Returns a human-readable string representation of the Bloom Filter. + * @param print_filter If true, the filter bits will be printed as well. + * @return A human-readable string representation of the Bloom Filter. + */ + string to_string(bool print_filter = false) const; + private: + bloom_filter_alloc(const uint64_t num_bits, const uint16_t num_hashes, const uint64_t seed, const Allocator& allocator = Allocator()); + // internal query/update methods void internal_update(const uint64_t h0, const uint64_t h1); bool internal_query_and_update(const uint64_t h0, const uint64_t h1); @@ -423,10 +458,13 @@ class bloom_filter_alloc { uint64_t seed_; uint16_t num_hashes_; bit_array_alloc bit_array_; + + friend class bloom_filter_builder_alloc; }; } // namespace datasketches +#include "bloom_filter_builder_impl.hpp" #include "bloom_filter_impl.hpp" #endif // _BLOOM_FILTER_HPP_ b diff --git a/filters/include/bloom_filter_builder_impl.hpp b/filters/include/bloom_filter_builder_impl.hpp new file mode 100644 index 00000000..a176aadf --- /dev/null +++ b/filters/include/bloom_filter_builder_impl.hpp @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef _BLOOM_FILTER_BUILDER_IMPL_HPP_ +#define _BLOOM_FILTER_BUILDER_IMPL_HPP_ + +#include +#include +#include + +#include "common_defs.hpp" +#include "xxhash64.h" + +namespace datasketches { + +template +uint16_t bloom_filter_builder_alloc::suggest_num_hashes(const uint64_t num_distinct_items, + const uint64_t num_filter_bits) { + // TODO: validate inputs > 0 + return static_cast(std::ceil(static_cast(num_filter_bits) / num_distinct_items * log(2.0))); +} + +template +uint16_t bloom_filter_builder_alloc::suggest_num_hashes(const double target_false_positive_prob) { + return static_cast(std::ceil(-log(target_false_positive_prob) / log(2.0))); +} + +template +uint64_t bloom_filter_builder_alloc::suggest_num_filter_bits(const uint64_t max_distinct_items, + const double target_false_positive_prob) { + return static_cast(std::ceil(-static_cast(max_distinct_items) * log(target_false_positive_prob) / (log(2.0) * log(2.0)))); +} + + +template +bloom_filter_alloc bloom_filter_builder_alloc::create_by_accuracy(const uint64_t num_distinct_items, + const double target_false_positive_prob, + const A& allocator) { + union { + int64_t long_value; + double double_value; + } ldu; + ldu.double_value = random_utils::next_double(random_utils::rand); + const uint64_t seed = ldu.long_value; + return create_by_accuracy(num_distinct_items, target_false_positive_prob, seed, allocator); +} + +template +bloom_filter_alloc bloom_filter_builder_alloc::create_by_accuracy(const uint64_t num_distinct_items, + const double target_false_positive_prob, + const uint64_t 
seed, + const A& allocator) { + const uint64_t num_filter_bits = bloom_filter_builder_alloc::suggest_num_filter_bits(num_distinct_items, target_false_positive_prob); + const uint16_t num_hashes = bloom_filter_builder_alloc::suggest_num_hashes(target_false_positive_prob); + return bloom_filter_alloc(num_filter_bits, num_hashes, seed, allocator); +} + +template +bloom_filter_alloc bloom_filter_builder_alloc::create_by_size(const uint64_t num_bits, + const uint16_t num_hashes, + const A& allocator) { + union { + int64_t long_value; + double double_value; + } ldu; + ldu.double_value = random_utils::next_double(random_utils::rand); + const uint64_t seed = ldu.long_value; + return create_by_size(num_bits, num_hashes, seed, allocator); +} + +template +bloom_filter_alloc bloom_filter_builder_alloc::create_by_size(const uint64_t num_bits, + const uint16_t num_hashes, + const uint64_t seed, + const A& allocator) { + return bloom_filter_alloc(num_bits, num_hashes, seed, allocator); +} + +} // namespace datasketches + +#endif // _BLOOM_FILTER_BUILDER_IMPL_HPP_ \ No newline at end of file diff --git a/filters/include/bloom_filter_impl.hpp b/filters/include/bloom_filter_impl.hpp index 162a4baa..5a46ed8e 100644 --- a/filters/include/bloom_filter_impl.hpp +++ b/filters/include/bloom_filter_impl.hpp @@ -46,7 +46,7 @@ bool bloom_filter_alloc::is_empty() const { } template -uint64_t bloom_filter_alloc::get_bits_used() const { +uint64_t bloom_filter_alloc::get_bits_used() { return bit_array_.get_num_bits_set(); } @@ -130,11 +130,11 @@ void bloom_filter_alloc::update(const double item) { int64_t long_value; double double_value; } ldu; - ldu.doubleBytes = static_cast(item); + ldu.double_value = static_cast(item); if (item == 0.0) { - ldu.doubleBytes = 0.0; // canonicalize -0.0 to 0.0 - } else if (std::isnan(ldu.doubleBytes)) { - ldu.longBytes = 0x7ff8000000000000L; // canonicalize NaN using value from Java's Double.doubleToLongBits() + ldu.double_value = 0.0; // canonicalize -0.0 to 0.0 
+ } else if (std::isnan(ldu.double_value)) { + ldu.long_value = 0x7ff8000000000000L; // canonicalize NaN using value from Java's Double.doubleToLongBits() } const uint64_t h0 = XXHash64::hash(&ldu, sizeof(ldu), seed_); const uint64_t h1 = XXHash64::hash(&ldu, sizeof(ldu), h0); @@ -223,11 +223,11 @@ bool bloom_filter_alloc::query_and_update(const double item) { int64_t long_value; double double_value; } ldu; - ldu.doubleBytes = item; + ldu.double_value = item; if (item == 0.0) { - ldu.doubleBytes = 0.0; // canonicalize -0.0 to 0.0 - } else if (std::isnan(ldu.doubleBytes)) { - ldu.longBytes = 0x7ff8000000000000L; // canonicalize NaN using value from Java's Double.doubleToLongBits() + ldu.double_value = 0.0; // canonicalize -0.0 to 0.0 + } else if (std::isnan(ldu.double_value)) { + ldu.long_value = 0x7ff8000000000000L; // canonicalize NaN using value from Java's Double.doubleToLongBits() } const uint64_t h0 = XXHash64::hash(&ldu, sizeof(ldu), seed_); const uint64_t h1 = XXHash64::hash(&ldu, sizeof(ldu), h0); @@ -251,7 +251,7 @@ template bool bloom_filter_alloc::internal_query_and_update(const uint64_t h0, const uint64_t h1) { const uint64_t num_bits = bit_array_.get_capacity(); bool value_exists = true; - for (uint16_t i = 0; i < num_hashes_; i++) { + for (uint16_t i = 1; i <= num_hashes_; i++) { const uint64_t hash_index = ((h0 + i * h1) >> 1) % num_bits; value_exists &= bit_array_.get_and_set_bit(hash_index); } @@ -345,7 +345,7 @@ bool bloom_filter_alloc::query(const void* item, size_t size) const { template bool bloom_filter_alloc::internal_query(const uint64_t h0, const uint64_t h1) const { const uint64_t num_bits = bit_array_.get_capacity(); - for (uint16_t i = 0; i < num_hashes_; i++) { + for (uint16_t i = 1; i <= num_hashes_; i++) { const uint64_t hash_index = ((h0 + i * h1) >> 1) % num_bits; if (!bit_array_.get_bit(hash_index)) return false; @@ -381,6 +381,28 @@ void bloom_filter_alloc::invert() { bit_array_.invert(); } +template +string 
bloom_filter_alloc::to_string(bool print_filter) const { + // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements. + // The stream does not support passing an allocator instance, and alternatives are complicated. + std::ostringstream oss; + oss << "### Bloom Filter Summary:" << std::endl; + oss << " num_bits : " << bit_array_.get_capacity() << std::endl; + oss << " num_hashes : " << num_hashes_ << std::endl; + oss << " seed : " << seed_ << std::endl; + oss << " bits_used : " << bit_array_.get_num_bits_set() << std::endl; + oss << " fill % : " << static_cast(bit_array_.get_num_bits_set() * 100) / bit_array_.get_capacity() << std::endl; + oss << "### End filter summary" << std::endl; + + if (print_filter) { + oss << bit_array_.to_string(); + } + + oss << std::endl; + return oss.str(); +} + + } // namespace datasketches #endif // _BLOOM_FILTER_IMPL_HPP_ \ No newline at end of file diff --git a/filters/test/bloom_filter_test.cpp b/filters/test/bloom_filter_test.cpp index a16f29f2..261f11fa 100644 --- a/filters/test/bloom_filter_test.cpp +++ b/filters/test/bloom_filter_test.cpp @@ -33,9 +33,178 @@ static std::string testBinaryInputPath = "test/"; namespace datasketches { TEST_CASE("bloom_filter: invalid constructor args", "[bloom_filter]") { - REQUIRE_THROWS_AS(bloom_filter(0, 4, DEFAULT_SEED), std::invalid_argument); - REQUIRE_THROWS_AS(bloom_filter(1L << 60, 4, DEFAULT_SEED), std::invalid_argument); - REQUIRE_THROWS_AS(bloom_filter(65535, 0, DEFAULT_SEED), std::invalid_argument); + REQUIRE_THROWS_AS(bloom_filter_builder::create_by_size(0, 4), std::invalid_argument); + REQUIRE_THROWS_AS(bloom_filter_builder::create_by_size(1L << 60, 4), std::invalid_argument); + REQUIRE_THROWS_AS(bloom_filter_builder::create_by_size(65535, 0), std::invalid_argument); } +TEST_CASE("bloom_filter: standard constructors", "[bloom_filter]") { + uint64_t num_items = 4000; + double fpp = 0.01; + + uint64_t num_bits = 
bloom_filter_builder::suggest_num_filter_bits(num_items, fpp); + uint16_t num_hashes = bloom_filter_builder::suggest_num_hashes(num_items, num_bits); + uint64_t seed = 89023; + + auto bf = bloom_filter_builder::create_by_size(num_bits, num_hashes, seed); + uint64_t adjusted_num_bits = (num_bits + 63) & ~0x3F; // round up to the nearest multiple of 64 + REQUIRE(bf.get_capacity() == adjusted_num_bits); + REQUIRE(bf.get_num_hashes() == num_hashes); + REQUIRE(bf.is_empty()); +} + +TEST_CASE("bloom_filter: basic operations", "[bloom_filter]") { + uint64_t num_bits = 8192; + uint16_t num_hashes = 3; + + auto bf = bloom_filter_builder::create_by_size(num_bits, num_hashes); + REQUIRE(bf.is_empty()); + REQUIRE(bf.get_capacity() == num_bits); // num_bits is multiple of 64 so should be exact + REQUIRE(bf.get_num_hashes() == num_hashes); + REQUIRE(bf.get_bits_used() == 0); + + uint64_t n = 1000; + for (uint64_t i = 0; i < n; ++i) { + bf.query_and_update(i); + } + + REQUIRE(!bf.is_empty()); + // these assume the filter isn't too close to capacity + REQUIRE(bf.get_bits_used() <= n * num_hashes); + REQUIRE(bf.get_bits_used() >= n * (num_hashes - 1)); + + uint32_t num_found = 0; + for (uint64_t i = 0; i < n; ++i) { + if (bf.query(i)) { + ++num_found; + } + } + REQUIRE(num_found >= n); + REQUIRE(num_found < 1.1 * n); + + bf.reset(); + // repeat initial tests from above + REQUIRE(bf.is_empty()); + REQUIRE(bf.get_capacity() == num_bits); + REQUIRE(bf.get_num_hashes() == num_hashes); + REQUIRE(bf.get_bits_used() == 0); +} + + +TEST_CASE("bloom_filter: inversion", "[bloom_filter]") { + uint64_t num_bits = 8192; + uint16_t num_hashes = 3; + + auto bf = bloom_filter_builder::create_by_size(num_bits, num_hashes); + + uint64_t n = 500; + for (uint64_t i = 0; i < n; ++i) { + bf.update(i); + } + uint64_t num_bits_set = bf.get_bits_used(); + bf.invert(); + REQUIRE(bf.get_bits_used() == num_bits - num_bits_set); + + // original items should be mostly not-present + uint32_t num_found = 0; + for 
(uint64_t i = 0; i < n; ++i) { + if (bf.query(i)) { + ++num_found; + } + } + REQUIRE(num_found < n / 10); + + // many other items should be "present" + num_found = 0; + for (uint64_t i = n; i < num_bits; ++i) { + if (bf.query(i)) { + ++num_found; + } + } + REQUIRE(num_found > n); +} + +TEST_CASE("bloom_filter: incompatible set operations", "[bloom_filter]") { + uint64_t num_bits = 32768; + uint16_t num_hashes = 4; + + auto bf1 = bloom_filter_builder::create_by_size(num_bits, num_hashes); + + // mismatched num bits (seed shared with bf1 so that only the size differs) + auto bf2 = bloom_filter_builder::create_by_size(2 * num_bits, num_hashes, bf1.get_seed()); + REQUIRE_THROWS_AS(bf1.union_with(bf2), std::invalid_argument); + + // mismatched num hashes (seed shared with bf1 so that only the hash count differs) + auto bf3 = bloom_filter_builder::create_by_size(num_bits, 2 * num_hashes, bf1.get_seed()); + REQUIRE_THROWS_AS(bf1.intersect(bf3), std::invalid_argument); + + // mismatched seed + auto bf4 = bloom_filter_builder::create_by_size(num_bits, num_hashes, bf1.get_seed() + 1); + REQUIRE_THROWS_AS(bf1.union_with(bf4), std::invalid_argument); +} + +TEST_CASE("bloom_filter: basic union", "[bloom_filter]") { + const uint64_t num_bits = 12288; + const uint16_t num_hashes = 4; + + auto bf1 = bloom_filter_builder::create_by_size(num_bits, num_hashes); + auto bf2 = bloom_filter_builder::create_by_size(num_bits, num_hashes, bf1.get_seed()); + + const uint64_t n = 1000; + const uint32_t max_item = 3 * n / 2 - 1; + for (uint64_t i = 0; i < n; ++i) { + bf1.query_and_update(i); + bf2.update(n / 2 + i); + } + + bf1.union_with(bf2); + for (uint64_t i = 0; i < max_item; ++i) { + REQUIRE(bf1.query(i)); + } + + uint32_t num_found = 0; + for (uint64_t i = max_item; i < num_bits; ++i) { + if (bf1.query(i)) { + ++num_found; + } + } + REQUIRE(num_found < num_bits / 10); // not being super strict +} + +TEST_CASE("bloom_filter: basic intersection", "[bloom_filter]") { + const uint64_t num_bits = 8192; + const uint16_t num_hahes = 5; + + auto bf1 = bloom_filter_builder::create_by_size(num_bits, num_hahes); + auto bf2 = 
bloom_filter_builder::create_by_size(num_bits, num_hahes, bf1.get_seed()); + + const uint64_t n = 1024; + const uint32_t max_item = 3 * n / 2 - 1; + for (uint64_t i = 0; i < n; ++i) { + bf1.update(i); + bf2.update(n / 2 + i); + } + + bf1.intersect(bf2); + // overlap bit should all be set + for (uint64_t i = n / 2; i < n; ++i) { + REQUIRE(bf1.query(i)); + } + + uint32_t num_found = 0; + for (uint64_t i = 0; i < n / 2; ++i) { + if (bf1.query(i)) { + ++num_found; + } + } + for (uint64_t i = max_item; i < num_bits; ++i) { + if (bf1.query(i)) { + ++num_found; + } + } + + REQUIRE(num_found < num_bits / 10); // not being super strict +} + + } // namespace datasketches From a5686a67dfd23c4a5eaafd3e3c1fea8093e6eeb0 Mon Sep 17 00:00:00 2001 From: Jon Malkin <786705+jmalkin@users.noreply.github.com> Date: Wed, 7 Aug 2024 14:24:09 -0700 Subject: [PATCH 06/30] Move to bit array as plain memory, move operations to static functions, to ultimately simplify memory handling with wrapped filters --- filters/CMakeLists.txt | 4 +- filters/include/bit_array.hpp | 165 ------------------ filters/include/bit_array_impl.hpp | 228 ------------------------- filters/include/bit_array_ops.hpp | 121 +++++++++++++ filters/include/bit_array_ops_impl.hpp | 108 ++++++++++++ filters/include/bloom_filter.hpp | 17 +- filters/include/bloom_filter_impl.hpp | 102 ++++++++--- filters/test/CMakeLists.txt | 2 +- filters/test/bit_array_ops_test.cpp | 107 ++++++++++++ filters/test/bit_array_test.cpp | 132 -------------- filters/test/bloom_filter_test.cpp | 4 - 11 files changed, 436 insertions(+), 554 deletions(-) delete mode 100644 filters/include/bit_array.hpp delete mode 100644 filters/include/bit_array_impl.hpp create mode 100644 filters/include/bit_array_ops.hpp create mode 100644 filters/include/bit_array_ops_impl.hpp create mode 100644 filters/test/bit_array_ops_test.cpp delete mode 100644 filters/test/bit_array_test.cpp diff --git a/filters/CMakeLists.txt b/filters/CMakeLists.txt index 
2ded3ed3..45da324b 100644 --- a/filters/CMakeLists.txt +++ b/filters/CMakeLists.txt @@ -39,6 +39,6 @@ install(FILES include/bloom_filter.hpp include/bloom_filter_impl.hpp include/bloom_filter_builder_impl.hpp - include/bit_array.hpp - include/bit_array_impl.hpp + include/bit_array_ops.hpp + include/bit_array_ops_impl.hpp DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches") diff --git a/filters/include/bit_array.hpp b/filters/include/bit_array.hpp deleted file mode 100644 index 78c088bb..00000000 --- a/filters/include/bit_array.hpp +++ /dev/null @@ -1,165 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#ifndef _BIT_ARRAY_HPP_ -#define _BIT_ARRAY_HPP_ - -#include -#include - -#include "common_defs.hpp" - -namespace datasketches { - -// forward declarations -template class bit_array_alloc; - -/// bit_array alias with default allocator -using bit_array = bit_array_alloc>; - -/** - * This class holds an array of bits suitable for use in a Bloom Filter. - * The representation is not compressed and is designed to fit in a single array - * in Java, meaning that the maximum number of bits is limited by the maximize - * size of an array of longs. 
- * - * For compatibility with Java, rounds the number of bits up to the smallest multiple of 64 - * (one long) that is not smaller than the specified number. - */ -template> -class bit_array_alloc { - using A = Allocator; - -public: - /** - * Construct a bit array with the given number of bits. - * @param numBits the number of bits to represent. - */ - explicit bit_array_alloc(const uint64_t num_bits, const Allocator& allocator = Allocator()); - - bool is_empty() const; - - bool is_dirty() const; - - /** - * Get the value of a bit at the given index. - * @param index the index of the bit to get - * @return the value of the bit at the given index. - */ - bool get_bit(const uint64_t index) const; - - /** - * Set the bit at the given index to 1. - * @param index the index of the bit to set. - */ - void set_bit(const uint64_t index); - - /** - * Set the bit at the given index to 0. - * @param index the index of the bit to clear. - */ - void clear_bit(const uint64_t index); - - /** - * Assign the value of the bit at the given index. - * @param index the index of the bit to set. - */ - void assign_bit(const uint64_t index, const bool value); - - /** - * Gets teh value of a bit at the specified index and sets it to true - * @poaram index the index of the bit to get and set - * @return the value of the bit at the specified index - */ - bool get_and_set_bit(const uint64_t index); - - /** - * @brief Gets the number of bits set in the bit array. - * @return the number of bits set in the bit array. - */ - uint64_t get_num_bits_set() const; - - /** - * @brief Gets the number of bits set in the bit array. - * @return the number of bits set in the bit array. - */ - uint64_t get_num_bits_set(); - - /** - * Resets the bit_aray, setting all bits to 0. - */ - void reset(); - - /** - * Gets the capacity of the bit array in bits. - * @return the capacity of the bit array in bits. 
- */ - uint64_t get_capacity() const; - - /** - * Performs a union operation on this bit array with another bit array. - * This operation modifies the current bit array to be the union of its original bits and the bits of the other array. - * The union operation is equivalent to a bitwise OR operation between the two arrays. - * - * @param other The other bit array to union with this one. - */ - void union_with(const bit_array_alloc& other); - - /** - * Performs an intersection operation on this bit array with another bit array. - * This operation modifies the current bit array to contain only the bits that are set in both this array and the other array. - * The intersection operation is equivalent to a bitwise AND operation between the two arrays. - * - * @param other The other bit array to intersect with this one. - */ - void intersect(const bit_array_alloc& other); - - /** - * Inverts the bits of this bit array. - * This operation modifies the current bit array by flipping all its bits; 0s become 1s and 1s become 0s. - */ - void invert(); - - /** - * Returns a string representation of the bit_array - * @return a string representation of the bit_array - */ - string to_string() const; - - /** - * @brief Get the allocator object - * - * @return Allocator - */ - Allocator get_allocator() const; - -private: - A allocator_; - uint64_t num_bits_set_; // if -1, need to recompute value - bool is_dirty_; - std::vector data_; - - uint64_t count_bits_set() const; -}; - -} // namespace datasketches - -#include "bit_array_impl.hpp" - -#endif // _BIT_ARRAY_HPP_ \ No newline at end of file diff --git a/filters/include/bit_array_impl.hpp b/filters/include/bit_array_impl.hpp deleted file mode 100644 index 41ae9f64..00000000 --- a/filters/include/bit_array_impl.hpp +++ /dev/null @@ -1,228 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#ifndef _BIT_ARRAY_IMPL_HPP_ -#define _BIT_ARRAY_IMPL_HPP_ - -#include -#include -#include - -#include "common_defs.hpp" -#include "bit_array.hpp" - -namespace datasketches { - -template -bit_array_alloc::bit_array_alloc(const uint64_t num_bits, const A& allocator) : - allocator_(allocator), - num_bits_set_(0), - is_dirty_(false) - { - if (num_bits == 0) { - throw std::invalid_argument("Number of bits must be greater than zero"); - } else if (num_bits >= (((1ULL << 31) - 1) * 64)) { - throw std::invalid_argument("Bits must be representable in fewer than 2^31 64-bit values"); - } - - // round up to the nearest multiple of 64, in bytes - data_ = std::vector(((num_bits + 63) >> 6) << 3, 0, allocator); -} - -template -bool bit_array_alloc::is_empty() const { - return !is_dirty_ && num_bits_set_ == 0; -} - -template -bool bit_array_alloc::is_dirty() const { - return is_dirty_; -} - -template -bool bit_array_alloc::get_bit(const uint64_t index) const { - if (index >= data_.size() << 3) { - throw std::out_of_range("Index out of range"); - } - return (data_[index >> 3] & (1 << (index & 7))) != 0; -} - -template -void bit_array_alloc::set_bit(const uint64_t index) { - if (index >= (data_.size() << 3)) { - std::cout << "index: " << index << ", size: " << data_.size() << ", 
size << 3: " << (data_.size() << 3) << std::endl; - throw std::out_of_range("Index out of range"); - } - data_[index >> 3] |= (1 << (index & 7)); - is_dirty_ = true; -} - -template -void bit_array_alloc::clear_bit(const uint64_t index) { - if (index >= data_.size() << 3) { - throw std::out_of_range("Index out of range"); - } - data_[index >> 3] &= ~(1 << (index & 7)); - is_dirty_ = true; -} - -template -void bit_array_alloc::assign_bit(const uint64_t index, const bool value) { - if (value) { - set_bit(index); - } else { - clear_bit(index); - } -} - -template -bool bit_array_alloc::get_and_set_bit(const uint64_t index) { - if (index >= data_.size() << 3) { - throw std::out_of_range("Index out of range"); - } - const uint64_t offset = index >> 3; - const uint8_t mask = 1 << (index & 7); - if ((data_[offset] & mask) != 0) { - return true; - } else { - data_[offset] |= mask; - ++num_bits_set_; // increment the number of bits set regardless of is_dirty_ - return false; - } -} - -template -uint64_t bit_array_alloc::count_bits_set() const { - uint64_t num_bits_set = 0; - - // we rounded up to a multiple of 64 so we know we can use 64-bit operations - const uint64_t* data64 = reinterpret_cast(data_.data()); - // Calculate the number of 64-bit chunks - uint64_t num_longs = data_.size() / 8; // 8 bytes per 64 bits - for (uint64_t i = 0; i < num_longs; ++i) { - // Wrap the 64-bit chunk with std::bitset for easy bit counting - std::bitset<64> bits(data64[i]); - num_bits_set += bits.count(); - } - return num_bits_set; -} - -template -uint64_t bit_array_alloc::get_num_bits_set() { - if (is_dirty_) { - num_bits_set_ = count_bits_set(); - } - return num_bits_set_; -} - -template -uint64_t bit_array_alloc::get_num_bits_set() const { - if (is_dirty_) { - return count_bits_set(); - } - return num_bits_set_; -} - -template -uint64_t bit_array_alloc::get_capacity() const { - return data_.size() << 3; // size in bits -} - -template -void bit_array_alloc::reset() { - uint8_t* data = 
data_.data(); - std::fill(data, data + data_.size(), 0); - num_bits_set_ = 0; - is_dirty_ = false; -} - -template -void bit_array_alloc::union_with(const bit_array_alloc& other) { - if (data_.size() != other.data_.size()) { - throw std::invalid_argument("Cannot union bit arrays with unequal lengths"); - } - - num_bits_set_ = 0; - for (uint64_t i = 0; i < data_.size(); ++i) { - data_[i] |= other.data_[i]; - std::bitset<8> bits(data_[i]); - num_bits_set_ += bits.count(); - } - is_dirty_ = false; -} - -template -void bit_array_alloc::intersect(const bit_array_alloc& other) { - if (data_.size() != other.data_.size()) { - throw std::invalid_argument("Cannot intersect bit arrays with unequal lengths"); - } - - num_bits_set_ = 0; - for (uint64_t i = 0; i < data_.size(); ++i) { - data_[i] &= other.data_[i]; - std::bitset<8> bits(data_[i]); - num_bits_set_ += bits.count(); - } - is_dirty_ = false; -} - -template -void bit_array_alloc::invert() { - if (is_dirty_) { - num_bits_set_ = 0; - for (uint64_t i = 0; i < data_.size(); ++i) { - data_[i] = ~data_[i]; - std::bitset<8> bits(data_[i]); - num_bits_set_ += bits.count(); - } - is_dirty_ = false; - } else { - for (uint64_t i = 0; i < data_.size(); ++i) { - data_[i] = ~data_[i]; - } - num_bits_set_ = get_capacity() - num_bits_set_; - } -} - -template -A bit_array_alloc::get_allocator() const { - return allocator_; -} - -template -string bit_array_alloc::to_string() const { - std::ostringstream oss; - uint64_t num_blocks = data_.size() / 8; // groups of 64 bits - for (uint64_t i = 0; i < num_blocks; ++i) { - oss << i << ": "; - for (uint64_t j = 0; j < 8; ++j) { // bytes w/in a block - for (uint64_t b = 0; b < 8; ++b) { // bits w/in a byte - oss << ((data_[i * 8 + j] & (1 << b)) ? 
"1" : "0"); - } - oss << " "; - } - oss << std::endl; - } - oss << std::endl; - return oss.str(); -} - -} // namespace datasketches - -#endif // _BIT_ARRAY_IMPL_HPP_ \ No newline at end of file diff --git a/filters/include/bit_array_ops.hpp b/filters/include/bit_array_ops.hpp new file mode 100644 index 00000000..9975d43a --- /dev/null +++ b/filters/include/bit_array_ops.hpp @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef _BIT_ARRAY_OPS_HPP_ +#define _BIT_ARRAY_OPS_HPP_ + +namespace datasketches { + +/** + * This namespace comprises functions that operate on one or more arrays of bits (uint8_t*) to + * provide bit array operations. The functions do not take ownership of memory and operate on + * arrays in-place. Sizes of the arrays, in bytes, are passed in as arguments. + * + * None of the functions in this namespace perform bounds checks. The caller is responsible for ensuring + * that indices are within the array bounds. + * + * Implementation assumes the actual arrays are multiples of 64 bits in length. + */ +namespace bit_array_ops { + + /** + * Get the value of a bit at the given index. 
+ * @param array the array of bits + * @param index the index of the bit to get + * @return the value of the bit at the given index. + */ + static bool get_bit(uint8_t* array, const uint64_t index); + + /** + * Set the bit at the given index to 1. + * @param array the array of bits + * @param index the index of the bit to set. + */ + static void set_bit(uint8_t* array, const uint64_t index); + + /** + * Set the bit at the given index to 0. + * @param array the array of bits + * @param index the index of the bit to clear. + */ + static void clear_bit(uint8_t* array, const uint64_t index); + + /** + * Assign the given value to the bit at the given index. + * @param array the array of bits + * @param index the index of the bit to set. + */ + static void assign_bit(uint8_t* array, const uint64_t index, const bool value); + + /** + * Gets the value of a bit at the specified index and sets it to true + * @param array the array of bits + * @param index the index of the bit to get and set + * @return the value of the bit at the specified index before it was set + */ + static bool get_and_set_bit(uint8_t* array, const uint64_t index); + + /** + * @brief Gets the number of bits set in the bit array. + * @param array the array of bits + * @param length_bytes the length of the array, in bytes + * @return the number of bits set in the bit array. + */ + static uint64_t count_num_bits_set(uint8_t* array, const uint64_t length_bytes); + + /** + * Performs a union operation on one bit array with another bit array. + * This operation modifies the tgt bit array to be the union of its original bits and the bits of the src array. + * The union operation is equivalent to a bitwise OR operation between the two arrays. 
+ * + * @param tgt the array of bits into which the results are written + * @param src the array of bits to union into tgt + * @param length_bytes the length of the two arrays, in bytes + * @return the number of bits set in the resulting array + */ + static uint64_t union_with(uint8_t* tgt, const uint8_t* src, const uint64_t length_bytes); + + /** + * Performs an intersection operation on one bit array with another bit array. + * This operation modifies the tgt bit array to contain only the bits that are set in both that array and the src array. + * The intersection operation is equivalent to a bitwise AND operation between the two arrays. + * + * @param tgt the array of bits into which the results are written + * @param src the array of bits to intersect with tgt + * @param length_bytes the length of the two arrays, in bytes + * @return the number of bits set in the resulting array + */ + static uint64_t intersect(uint8_t* tgt, const uint8_t* src, const uint64_t length_bytes); + + /** + * Inverts the bits of this bit array. + * This operation modifies the bit array by flipping all its bits; 0s become 1s and 1s become 0s. + * @param array the array of bits + * @param length_bytes the length of the array, in bytes + * @return the number of bits set in the resulting array + */ + static uint64_t invert(uint8_t* array, const uint64_t length_bytes); + +} // namespace bit_array_ops + +} // namespace datasketches + +#include "bit_array_ops_impl.hpp" + +#endif // _BIT_ARRAY_OPS_HPP_ \ No newline at end of file diff --git a/filters/include/bit_array_ops_impl.hpp b/filters/include/bit_array_ops_impl.hpp new file mode 100644 index 00000000..e6af4159 --- /dev/null +++ b/filters/include/bit_array_ops_impl.hpp @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef _BIT_ARRAY_OPS_IMPL_HPP_ +#define _BIT_ARRAY_OPS_IMPL_HPP_ + +#include + +#include "bit_array_ops.hpp" + +namespace datasketches { + +bool bit_array_ops::get_bit(uint8_t* array, const uint64_t index) { + return (array[index >> 3] & (1 << (index & 7))) != 0; +} + +void bit_array_ops::set_bit(uint8_t* array, const uint64_t index) { + array[index >> 3] |= (1 << (index & 7)); +} + +void bit_array_ops::clear_bit(uint8_t* array, const uint64_t index) { + array[index >> 3] &= ~(1 << (index & 7)); +} + +void bit_array_ops::assign_bit(uint8_t* array, const uint64_t index, const bool value) { + // read-only checks handled by set_bit() and clear_bit() + if (value) { + set_bit(array, index); + } else { + clear_bit(array, index); + } +} + +bool bit_array_ops::get_and_set_bit(uint8_t* array, const uint64_t index) { + const uint64_t offset = index >> 3; + const uint8_t mask = 1 << (index & 7); + if ((array[offset] & mask) != 0) { + return true; + } else { + array[offset] |= mask; + return false; + } +} + +uint64_t bit_array_ops::count_num_bits_set(uint8_t* array, const uint64_t length_bytes) { + uint64_t num_bits_set = 0; + + // we rounded up to a multiple of 64 so we know we can use 64-bit operations + const uint64_t* array64 = reinterpret_cast(array); + // Calculate the number of 64-bit chunks + uint64_t num_longs = length_bytes / 8; // 8 bytes per 64 bits + for (uint64_t i = 0; i < 
num_longs; ++i) { + // Wrap the 64-bit chunk with std::bitset for easy bit counting + std::bitset<64> bits(array64[i]); + num_bits_set += bits.count(); + } + return num_bits_set; +} + +uint64_t bit_array_ops::union_with(uint8_t* tgt, const uint8_t* src, const uint64_t length_bytes) { + uint64_t num_bits_set = 0; + for (uint64_t i = 0; i < length_bytes; ++i) { + tgt[i] |= src[i]; + std::bitset<8> bits(tgt[i]); + num_bits_set += bits.count(); + } + return num_bits_set; +} + +uint64_t bit_array_ops::intersect(uint8_t* tgt, const uint8_t* src, const uint64_t length_bytes) { + uint64_t num_bits_set = 0; + for (uint64_t i = 0; i < length_bytes; ++i) { + tgt[i] &= src[i]; + std::bitset<8> bits(tgt[i]); + num_bits_set += bits.count(); + } + return num_bits_set; +} + +uint64_t bit_array_ops::invert(uint8_t* array, const uint64_t length_bytes) { + uint64_t num_bits_set = 0; + for (uint64_t i = 0; i < length_bytes; ++i) { + array[i] = ~array[i]; + std::bitset<8> bits(array[i]); + num_bits_set += bits.count(); + } + return num_bits_set; +} + +} // namespace datasketches + +#endif // _BIT_ARRAY_OPS_IMPL_HPP_ \ No newline at end of file diff --git a/filters/include/bloom_filter.hpp b/filters/include/bloom_filter.hpp index e9076c81..5e0ed75c 100644 --- a/filters/include/bloom_filter.hpp +++ b/filters/include/bloom_filter.hpp @@ -20,10 +20,10 @@ #ifndef _BLOOM_FILTER_HPP_ #define _BLOOM_FILTER_HPP_ +#include #include #include -#include "bit_array.hpp" #include "common_defs.hpp" namespace datasketches { @@ -445,8 +445,15 @@ class bloom_filter_alloc { */ string to_string(bool print_filter = false) const; + /** + * @brief Destroy the bloom filter alloc object + */ + ~bloom_filter_alloc(); private: + static const uint64_t MAX_HEADER_SIZE_BYTES = 32; // 4 Java Longs + static const uint64_t MAX_FILTER_SIZE_BITS = (INT32_MAX - MAX_HEADER_SIZE_BYTES) * sizeof(uint64_t); + bloom_filter_alloc(const uint64_t num_bits, const uint16_t num_hashes, const uint64_t seed, const Allocator& allocator 
= Allocator()); // internal query/update methods @@ -457,7 +464,13 @@ class bloom_filter_alloc { A allocator_; uint64_t seed_; uint16_t num_hashes_; - bit_array_alloc bit_array_; + bool is_dirty_; + bool is_owned_; // if true, data is not owned by filter AND data_ holdes the entire filter not just the bit array + bool is_read_only_; // if true, filter is read-only + uint64_t capacity_bits_; + uint64_t num_bits_set_; + uint8_t* bit_array_; // data backing bit_array_, regardless of ownership + uint8_t* memory_; // if wrapped, pointer to the start of the filter, otheriwse nullptr friend class bloom_filter_builder_alloc; }; diff --git a/filters/include/bloom_filter_impl.hpp b/filters/include/bloom_filter_impl.hpp index 5a46ed8e..7186763e 100644 --- a/filters/include/bloom_filter_impl.hpp +++ b/filters/include/bloom_filter_impl.hpp @@ -24,6 +24,7 @@ #include #include "common_defs.hpp" +#include "bit_array_ops.hpp" #include "xxhash64.h" namespace datasketches { @@ -33,26 +34,64 @@ bloom_filter_alloc::bloom_filter_alloc(const uint64_t num_bits, const uint16_ allocator_(allocator), seed_(seed), num_hashes_(num_hashes), - bit_array_(num_bits, allocator) + is_dirty_(false), + is_owned_(true), + is_read_only_(false), + capacity_bits_(((num_bits + 63) >> 6) << 6), // can round to nearest multiple of 64 prior to bounds checks + num_bits_set_(0) { if (num_hashes == 0) { throw std::invalid_argument("Must have at least 1 hash function"); } + if (num_bits == 0) { + throw std::invalid_argument("Number of bits must be greater than zero"); + } else if (num_bits > MAX_FILTER_SIZE_BITS) { + throw std::invalid_argument("Filter may not exceed " + std::to_string(MAX_FILTER_SIZE_BITS) + " bits"); + } + + const uint64_t num_bytes = capacity_bits_ >> 3; + bit_array_ = allocator_.allocate(num_bytes); + std::fill_n(bit_array_, num_bytes, 0); + if (bit_array_ == nullptr) { + throw std::bad_alloc(); + } + memory_ = nullptr; + } + +template +bloom_filter_alloc::~bloom_filter_alloc() { + // TODO: 
handle when only bit_array_ is used + // TODO: handle when memory_ is used and bit_array_ is a pointer into it + /* + if (is_owned_ && memory_ != nullptr) { + allocator_.deallocate(memory_, capacity_bits_ >> 3); + memory_ = nullptr; + bit_array_ = nullptr; // just to be safe + } + */ + if (memory_ == nullptr && bit_array_ != nullptr) { + allocator_.deallocate(bit_array_, capacity_bits_ >> 3); + bit_array_ = nullptr; } +} template bool bloom_filter_alloc::is_empty() const { - return bit_array_.is_empty(); + return !is_dirty_ && num_bits_set_ == 0; } template uint64_t bloom_filter_alloc::get_bits_used() { - return bit_array_.get_num_bits_set(); + if (is_dirty_) { + num_bits_set_ = bit_array_ops::count_num_bits_set(bit_array_, capacity_bits_ >> 3); + is_dirty_ = false; + } + return num_bits_set_; } template uint64_t bloom_filter_alloc::get_capacity() const { - return bit_array_.get_capacity(); + return capacity_bits_; } template @@ -67,7 +106,10 @@ uint64_t bloom_filter_alloc::get_seed() const { template void bloom_filter_alloc::reset() { - bit_array_.reset(); + // TODO: if wrapped, update num_bits_set in memory, too + num_bits_set_ = 0; + is_dirty_ = false; + std::fill(bit_array_, bit_array_ + (capacity_bits_ >> 3), 0); } // UPDATE METHODS @@ -156,11 +198,12 @@ void bloom_filter_alloc::update(const void* item, size_t size) { template void bloom_filter_alloc::internal_update(const uint64_t h0, const uint64_t h1) { - const uint64_t num_bits = bit_array_.get_capacity(); + const uint64_t num_bits = get_capacity(); for (uint16_t i = 1; i <= num_hashes_; i++) { const uint64_t hash_index = ((h0 + i * h1) >> 1) % num_bits; - bit_array_.set_bit(hash_index); + bit_array_ops::set_bit(bit_array_, hash_index); } + is_dirty_ = true; } // QUERY-AND-UPDATE METHODS @@ -249,11 +292,13 @@ bool bloom_filter_alloc::query_and_update(const void* item, size_t size) { template bool bloom_filter_alloc::internal_query_and_update(const uint64_t h0, const uint64_t h1) { - const uint64_t num_bits 
= bit_array_.get_capacity(); + const uint64_t num_bits = get_capacity(); bool value_exists = true; for (uint16_t i = 1; i <= num_hashes_; i++) { const uint64_t hash_index = ((h0 + i * h1) >> 1) % num_bits; - value_exists &= bit_array_.get_and_set_bit(hash_index); + bool value = bit_array_ops::get_and_set_bit(bit_array_, hash_index); + num_bits_set_ += value ? 0 : 1; + value_exists &= value; } return value_exists; } @@ -344,10 +389,10 @@ bool bloom_filter_alloc::query(const void* item, size_t size) const { template bool bloom_filter_alloc::internal_query(const uint64_t h0, const uint64_t h1) const { - const uint64_t num_bits = bit_array_.get_capacity(); + const uint64_t num_bits = get_capacity(); for (uint16_t i = 1; i <= num_hashes_; i++) { const uint64_t hash_index = ((h0 + i * h1) >> 1) % num_bits; - if (!bit_array_.get_bit(hash_index)) + if (!bit_array_ops::get_bit(bit_array_, hash_index)) return false; } return true; @@ -357,7 +402,10 @@ bool bloom_filter_alloc::internal_query(const uint64_t h0, const uint64_t h1) template bool bloom_filter_alloc::is_compatible(const bloom_filter_alloc& other) const { - return seed_ == other.seed_ && num_hashes_ == other.num_hashes_ && bit_array_.get_capacity() == other.bit_array_.get_capacity(); + return seed_ == other.seed_ + && num_hashes_ == other.num_hashes_ + && get_capacity() == other.get_capacity() + ; } template @@ -365,7 +413,8 @@ void bloom_filter_alloc::union_with(const bloom_filter_alloc& other) { if (!is_compatible(other)) { throw std::invalid_argument("Incompatible bloom filters"); } - bit_array_.union_with(other.bit_array_); + num_bits_set_ = bit_array_ops::union_with(bit_array_, other.bit_array_, capacity_bits_ >> 3); + is_dirty_ = false; } template @@ -373,12 +422,14 @@ void bloom_filter_alloc::intersect(const bloom_filter_alloc& other) { if (!is_compatible(other)) { throw std::invalid_argument("Incompatible bloom filters"); } - bit_array_.intersect(other.bit_array_); + num_bits_set_ = 
bit_array_ops::intersect(bit_array_, other.bit_array_, capacity_bits_ >> 3); + is_dirty_ = false; } template void bloom_filter_alloc::invert() { - bit_array_.invert(); + num_bits_set_ = bit_array_ops::invert(bit_array_, capacity_bits_ >> 3); + is_dirty_ = false; } template @@ -387,19 +438,30 @@ string bloom_filter_alloc::to_string(bool print_filter) const { // The stream does not support passing an allocator instance, and alternatives are complicated. std::ostringstream oss; oss << "### Bloom Filter Summary:" << std::endl; - oss << " num_bits : " << bit_array_.get_capacity() << std::endl; + oss << " num_bits : " << get_capacity() << std::endl; oss << " num_hashes : " << num_hashes_ << std::endl; oss << " seed : " << seed_ << std::endl; - oss << " bits_used : " << bit_array_.get_num_bits_set() << std::endl; - oss << " fill % : " << static_cast(bit_array_.get_num_bits_set() * 100) / bit_array_.get_capacity() << std::endl; + oss << " bits_used : " << get_bits_used() << std::endl; + oss << " fill % : " << (get_bits_used() * 100.0) / get_capacity() << std::endl; oss << "### End filter summary" << std::endl; if (print_filter) { - oss << bit_array_.to_string(); + uint64_t num_blocks = capacity_bits_ >> 6; // groups of 64 bits + for (uint64_t i = 0; i < num_blocks; ++i) { + oss << i << ": "; + for (uint64_t j = 0; j < 8; ++j) { // bytes w/in a block + for (uint64_t b = 0; b < 8; ++b) { // bits w/in a byte + oss << ((bit_array_[i * 8 + j] & (1 << b)) ? 
"1" : "0"); + } + oss << " "; + } + oss << std::endl; + } + oss << std::endl; } oss << std::endl; - return oss.str(); + return string(oss.str(), allocator_); } diff --git a/filters/test/CMakeLists.txt b/filters/test/CMakeLists.txt index 9fa7f5de..67615d0f 100644 --- a/filters/test/CMakeLists.txt +++ b/filters/test/CMakeLists.txt @@ -40,6 +40,6 @@ add_test( target_sources(bloom_filter_test PRIVATE - bit_array_test.cpp + bit_array_ops_test.cpp bloom_filter_test.cpp ) diff --git a/filters/test/bit_array_ops_test.cpp b/filters/test/bit_array_ops_test.cpp new file mode 100644 index 00000000..322c17d3 --- /dev/null +++ b/filters/test/bit_array_ops_test.cpp @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include +#include + +#include "bit_array_ops.hpp" + +namespace datasketches { + +TEST_CASE("bit_array: basic operation", "[bit_array]") { + uint8_t* data = new uint8_t[16]; + std::fill_n(data, 16, 0); + REQUIRE(bit_array_ops::get_and_set_bit(data, 1) == false); + REQUIRE(bit_array_ops::get_and_set_bit(data, 2) == false); + for (int i = 4; i < 64; i <<= 1) { + REQUIRE(bit_array_ops::get_and_set_bit(data, 64 + i) == false); + } + + REQUIRE(bit_array_ops::count_num_bits_set(data, 16) == 6); + REQUIRE(bit_array_ops::get_bit(data, 68)); + + REQUIRE(bit_array_ops::get_bit(data, 5) == false); + bit_array_ops::set_bit(data, 5); + REQUIRE(bit_array_ops::get_and_set_bit(data, 5)); + REQUIRE(bit_array_ops::count_num_bits_set(data, 16) == 7); + + bit_array_ops::clear_bit(data, 5); + REQUIRE(bit_array_ops::get_bit(data, 5) == false); + REQUIRE(bit_array_ops::count_num_bits_set(data, 16) == 6); + + std::fill(data, data + 16, 0); + REQUIRE(bit_array_ops::count_num_bits_set(data, 16) == 0); + + bit_array_ops::set_bit(data, 35); + REQUIRE(bit_array_ops::get_and_set_bit(data, 35)); + bit_array_ops::assign_bit(data, 35, false); + REQUIRE(bit_array_ops::get_bit(data, 35) == false); + bit_array_ops::assign_bit(data, 35, true); + REQUIRE(bit_array_ops::get_bit(data, 35)); + + delete [] data; +} + +TEST_CASE("bit_array: inversion", "[bit_array]") { + size_t num_bits = 1024; + uint8_t* data = new uint8_t[num_bits / 8]; + std::fill_n(data, num_bits / 8, 0); + for (size_t i = 0; i < num_bits; i += num_bits / 8) { + bit_array_ops::get_and_set_bit(data, i); + } + REQUIRE(bit_array_ops::get_bit(data, 0)); + + size_t num_bits_set = bit_array_ops::count_num_bits_set(data, num_bits / 8); + bit_array_ops::invert(data, num_bits / 8); + REQUIRE(bit_array_ops::count_num_bits_set(data, num_bits / 8) == num_bits - num_bits_set); + REQUIRE(bit_array_ops::get_bit(data, 0) == false); + + delete [] data; +} + +TEST_CASE("bit_array: intersection and union", "[bit_array]") { + uint8_t* data1 = new 
uint8_t[8]; + uint8_t* data2 = new uint8_t[8]; + uint8_t* data3 = new uint8_t[8]; + std::fill_n(data1, 8, 0); + std::fill_n(data2, 8, 0); + std::fill_n(data3, 8, 0); + + size_t n = 10; + for (size_t i = 0; i < n; ++i) { + bit_array_ops::get_and_set_bit(data1, i); + bit_array_ops::get_and_set_bit(data2, i + (n / 2)); + bit_array_ops::get_and_set_bit(data3, 2 * i); + } + REQUIRE(bit_array_ops::count_num_bits_set(data1, 8) == n); + REQUIRE(bit_array_ops::count_num_bits_set(data2, 8) == n); + REQUIRE(bit_array_ops::count_num_bits_set(data3, 8) == n); + + bit_array_ops::intersect(data1, data2, 8); + REQUIRE(bit_array_ops::count_num_bits_set(data1, 8) == n / 2); + + bit_array_ops::union_with(data3, data2, 8); + REQUIRE(bit_array_ops::count_num_bits_set(data3, 8) == 3 * n / 2); + + delete [] data1; + delete [] data2; + delete [] data3; +} + +} // namespace datasketches diff --git a/filters/test/bit_array_test.cpp b/filters/test/bit_array_test.cpp deleted file mode 100644 index a3760beb..00000000 --- a/filters/test/bit_array_test.cpp +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -#include -//#include -//#include -//#include - -#include "bit_array.hpp" - -#ifdef TEST_BINARY_INPUT_PATH -static std::string testBinaryInputPath = TEST_BINARY_INPUT_PATH; -#else -static std::string testBinaryInputPath = "test/"; -#endif - -namespace datasketches { - -TEST_CASE("bit_array: invalid num_bits", "[bit_array]") { - REQUIRE_THROWS_AS(bit_array(0), std::invalid_argument); - REQUIRE_THROWS_AS(bit_array(1L << 60), std::invalid_argument); -} - -TEST_CASE("bit_array: construction", "[bit_array]") { - bit_array ba(64); - REQUIRE(ba.get_capacity() == 64); - REQUIRE(ba.get_num_bits_set() == 0); - REQUIRE(ba.is_empty()); - REQUIRE(!ba.is_dirty()); -} - -TEST_CASE("bit_array: basic operation", "[bit_array]") { - bit_array ba(128); - REQUIRE(ba.get_and_set_bit(1) == false); - REQUIRE(ba.get_and_set_bit(2) == false); - for (int i = 4; i < 64; i <<= 1) { - REQUIRE(ba.get_and_set_bit(64 + i) == false); - } - - REQUIRE(ba.get_num_bits_set() == 6); - REQUIRE(ba.get_bit(68)); - REQUIRE(!ba.is_empty()); - - REQUIRE(ba.get_bit(5) == false); - ba.set_bit(5); - REQUIRE(ba.get_and_set_bit(5)); - REQUIRE(ba.get_num_bits_set() == 7); - - ba.clear_bit(5); - REQUIRE(ba.get_bit(5) == false); - REQUIRE(ba.get_num_bits_set() == 6); - - ba.reset(); - REQUIRE(ba.is_empty()); - REQUIRE(ba.get_num_bits_set() == 0); - - ba.set_bit(35); - REQUIRE(ba.get_and_set_bit(35)); - ba.assign_bit(35, false); - REQUIRE(ba.get_bit(35) == false); - ba.assign_bit(35, true); - REQUIRE(ba.get_bit(35)); - - REQUIRE(ba.to_string().length() > 0); -} - -TEST_CASE("bit_array: inversion", "[bit_array]") { - size_t num_bits = 1024; - bit_array ba(num_bits); - for (size_t i = 0; i < num_bits; i += num_bits / 8) { - ba.get_and_set_bit(i); - } - REQUIRE(ba.get_bit(0)); - - size_t num_bits_set = ba.get_num_bits_set(); - ba.invert(); - REQUIRE(ba.get_num_bits_set() == num_bits - num_bits_set); - REQUIRE(ba.get_bit(0) == false); - - // update to make dirty and invert again - ba.set_bit(0); - ba.invert(); - 
REQUIRE(ba.get_num_bits_set() == num_bits_set - 1); - REQUIRE(ba.get_bit(0) == false); -} - -TEST_CASE("bit_array: invalid union and intersection", "[bit_array]") { - bit_array ba1(64); - bit_array ba2(128); - REQUIRE_THROWS_AS(ba1.union_with(ba2), std::invalid_argument); - REQUIRE_THROWS_AS(ba1.intersect(ba2), std::invalid_argument); -} - -TEST_CASE("bit_array: intersection and union", "[bit_array]") { - bit_array ba1(64); - bit_array ba2(64); - bit_array ba3(64); - - size_t n = 10; - for (size_t i = 0; i < n; ++i) { - ba1.get_and_set_bit(i); - ba2.get_and_set_bit(i + (n / 2)); - ba3.get_and_set_bit(2 * i); - } - REQUIRE(ba1.get_num_bits_set() == n); - REQUIRE(ba2.get_num_bits_set() == n); - REQUIRE(ba3.get_num_bits_set() == n); - - ba1.intersect(ba2); - REQUIRE(ba1.get_num_bits_set() == n / 2); - - ba3.union_with(ba2); - REQUIRE(ba3.get_num_bits_set() == 3 * n / 2); -} - -} // namespace datasketches diff --git a/filters/test/bloom_filter_test.cpp b/filters/test/bloom_filter_test.cpp index 261f11fa..dee240bf 100644 --- a/filters/test/bloom_filter_test.cpp +++ b/filters/test/bloom_filter_test.cpp @@ -18,9 +18,6 @@ */ #include -//#include -//#include -//#include #include "bloom_filter.hpp" @@ -90,7 +87,6 @@ TEST_CASE("bloom_filter: basic operations", "[bloom_filter]") { REQUIRE(bf.get_bits_used() == 0); } - TEST_CASE("bloom_filter: inversion", "[bloom_filter]") { uint64_t num_bits = 8192; uint16_t num_hashes = 3; From 9dbcde5c2e266d597f3e02993bc7cb346e2c3cc7 Mon Sep 17 00:00:00 2001 From: Jon Malkin <786705+jmalkin@users.noreply.github.com> Date: Wed, 7 Aug 2024 15:26:50 -0700 Subject: [PATCH 07/30] Add sstream header --- filters/include/bloom_filter_impl.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/filters/include/bloom_filter_impl.hpp b/filters/include/bloom_filter_impl.hpp index 7186763e..0dcca364 100644 --- a/filters/include/bloom_filter_impl.hpp +++ b/filters/include/bloom_filter_impl.hpp @@ -21,6 +21,7 @@ #define _BLOOM_FILTER_IMPL_HPP_ #include 
+#include #include #include "common_defs.hpp" From 814486936438837b86fc32b6aa865f3ecc13cc07 Mon Sep 17 00:00:00 2001 From: Jon Malkin <786705+jmalkin@users.noreply.github.com> Date: Fri, 9 Aug 2024 10:25:51 -0700 Subject: [PATCH 08/30] move bit_array_ops functions to be inline, WIP: blooom serialization --- filters/CMakeLists.txt | 1 - filters/include/bit_array_ops.hpp | 79 +++++++-- filters/include/bit_array_ops_impl.hpp | 108 ----------- filters/include/bloom_filter.hpp | 49 +++++ filters/include/bloom_filter_impl.hpp | 236 ++++++++++++++++++++++--- 5 files changed, 333 insertions(+), 140 deletions(-) delete mode 100644 filters/include/bit_array_ops_impl.hpp diff --git a/filters/CMakeLists.txt b/filters/CMakeLists.txt index 45da324b..66bce141 100644 --- a/filters/CMakeLists.txt +++ b/filters/CMakeLists.txt @@ -40,5 +40,4 @@ install(FILES include/bloom_filter_impl.hpp include/bloom_filter_builder_impl.hpp include/bit_array_ops.hpp - include/bit_array_ops_impl.hpp DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches") diff --git a/filters/include/bit_array_ops.hpp b/filters/include/bit_array_ops.hpp index 9975d43a..459717ba 100644 --- a/filters/include/bit_array_ops.hpp +++ b/filters/include/bit_array_ops.hpp @@ -40,28 +40,41 @@ namespace bit_array_ops { * @param index the index of the bit to get * @return the value of the bit at the given index. */ - static bool get_bit(uint8_t* array, const uint64_t index); + static inline bool get_bit(uint8_t* array, const uint64_t index) { + return (array[index >> 3] & (1 << (index & 7))) != 0; + } /** * Set the bit at the given index to 1. * @param array the array of bits * @param index the index of the bit to set. */ - static void set_bit(uint8_t* array, const uint64_t index); + static inline void set_bit(uint8_t* array, const uint64_t index) { + array[index >> 3] |= (1 << (index & 7)); + } /** * Set the bit at the given index to 0. * @param array the array of bits * @param index the index of the bit to clear. 
*/ - static void clear_bit(uint8_t* array, const uint64_t index); + static inline void clear_bit(uint8_t* array, const uint64_t index) { + array[index >> 3] &= ~(1 << (index & 7)); + } /** * Assign the value of the bit at the given index. * @param array the array of bits * @param index the index of the bit to set. */ - static void assign_bit(uint8_t* array, const uint64_t index, const bool value); + static inline void assign_bit(uint8_t* array, const uint64_t index, const bool value) { + // read-only checks handled by set_bit() and clear_bit() + if (value) { + set_bit(array, index); + } else { + clear_bit(array, index); + } + } /** * Gets teh value of a bit at the specified index and sets it to true @@ -69,7 +82,16 @@ namespace bit_array_ops { * @param index the index of the bit to get and set * @return the value of the bit at the specified index */ - static bool get_and_set_bit(uint8_t* array, const uint64_t index); + static inline bool get_and_set_bit(uint8_t* array, const uint64_t index) { + const uint64_t offset = index >> 3; + const uint8_t mask = 1 << (index & 7); + if ((array[offset] & mask) != 0) { + return true; + } else { + array[offset] |= mask; + return false; + } + } /** * @brief Gets the number of bits set in the bit array. @@ -77,7 +99,20 @@ namespace bit_array_ops { * @param length_bytes the length of the array, in bytes * @return the number of bits set in the bit array. 
*/ - static uint64_t count_num_bits_set(uint8_t* array, const uint64_t length_bytes); + static inline uint64_t count_num_bits_set(uint8_t* array, const uint64_t length_bytes) { + uint64_t num_bits_set = 0; + + // we rounded up to a multiple of 64 so we know we can use 64-bit operations + const uint64_t* array64 = reinterpret_cast(array); + // Calculate the number of 64-bit chunks + uint64_t num_longs = length_bytes / 8; // 8 bytes per 64 bits + for (uint64_t i = 0; i < num_longs; ++i) { + // Wrap the 64-bit chunk with std::bitset for easy bit counting + std::bitset<64> bits(array64[i]); + num_bits_set += bits.count(); + } + return num_bits_set; + } /** * Performs a union operation on one bit array with another bit array. @@ -89,7 +124,15 @@ namespace bit_array_ops { * @param length_bytes the length of the two arrays, in bytes * @return the number of bits set in the resulting array */ - static uint64_t union_with(uint8_t* tgt, const uint8_t* src, const uint64_t length_bytes); + static inline uint64_t union_with(uint8_t* tgt, const uint8_t* src, const uint64_t length_bytes) { + uint64_t num_bits_set = 0; + for (uint64_t i = 0; i < length_bytes; ++i) { + tgt[i] |= src[i]; + std::bitset<8> bits(tgt[i]); + num_bits_set += bits.count(); + } + return num_bits_set; + } /** * Performs an intersection operation on one bit array with another bit array. @@ -101,7 +144,15 @@ namespace bit_array_ops { * @param length_bytes the length of the two arrays, in bytes * @return the number of bits set in the resulting array */ - static uint64_t intersect(uint8_t* tgt, const uint8_t* src, const uint64_t length_bytes); + static inline uint64_t intersect(uint8_t* tgt, const uint8_t* src, const uint64_t length_bytes) { + uint64_t num_bits_set = 0; + for (uint64_t i = 0; i < length_bytes; ++i) { + tgt[i] &= src[i]; + std::bitset<8> bits(tgt[i]); + num_bits_set += bits.count(); + } + return num_bits_set; + } /** * Inverts the bits of this bit array. 
@@ -110,12 +161,18 @@ namespace bit_array_ops { * @param length_bytes the length of the array, in bytes * @return the number of bits set in the resulting array */ - static uint64_t invert(uint8_t* array, const uint64_t length_bytes); + static inline uint64_t invert(uint8_t* array, const uint64_t length_bytes) { + uint64_t num_bits_set = 0; + for (uint64_t i = 0; i < length_bytes; ++i) { + array[i] = ~array[i]; + std::bitset<8> bits(array[i]); + num_bits_set += bits.count(); + } + return num_bits_set; + } } // namespace bit_array_ops } // namespace datasketches -#include "bit_array_ops_impl.hpp" - #endif // _BIT_ARRAY_OPS_HPP_ \ No newline at end of file diff --git a/filters/include/bit_array_ops_impl.hpp b/filters/include/bit_array_ops_impl.hpp deleted file mode 100644 index e6af4159..00000000 --- a/filters/include/bit_array_ops_impl.hpp +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -#ifndef _BIT_ARRAY_OPS_IMPL_HPP_ -#define _BIT_ARRAY_OPS_IMPL_HPP_ - -#include - -#include "bit_array_ops.hpp" - -namespace datasketches { - -bool bit_array_ops::get_bit(uint8_t* array, const uint64_t index) { - return (array[index >> 3] & (1 << (index & 7))) != 0; -} - -void bit_array_ops::set_bit(uint8_t* array, const uint64_t index) { - array[index >> 3] |= (1 << (index & 7)); -} - -void bit_array_ops::clear_bit(uint8_t* array, const uint64_t index) { - array[index >> 3] &= ~(1 << (index & 7)); -} - -void bit_array_ops::assign_bit(uint8_t* array, const uint64_t index, const bool value) { - // read-only checks handled by set_bit() and clear_bit() - if (value) { - set_bit(array, index); - } else { - clear_bit(array, index); - } -} - -bool bit_array_ops::get_and_set_bit(uint8_t* array, const uint64_t index) { - const uint64_t offset = index >> 3; - const uint8_t mask = 1 << (index & 7); - if ((array[offset] & mask) != 0) { - return true; - } else { - array[offset] |= mask; - return false; - } -} - -uint64_t bit_array_ops::count_num_bits_set(uint8_t* array, const uint64_t length_bytes) { - uint64_t num_bits_set = 0; - - // we rounded up to a multiple of 64 so we know we can use 64-bit operations - const uint64_t* array64 = reinterpret_cast(array); - // Calculate the number of 64-bit chunks - uint64_t num_longs = length_bytes / 8; // 8 bytes per 64 bits - for (uint64_t i = 0; i < num_longs; ++i) { - // Wrap the 64-bit chunk with std::bitset for easy bit counting - std::bitset<64> bits(array64[i]); - num_bits_set += bits.count(); - } - return num_bits_set; -} - -uint64_t bit_array_ops::union_with(uint8_t* tgt, const uint8_t* src, const uint64_t length_bytes) { - uint64_t num_bits_set = 0; - for (uint64_t i = 0; i < length_bytes; ++i) { - tgt[i] |= src[i]; - std::bitset<8> bits(tgt[i]); - num_bits_set += bits.count(); - } - return num_bits_set; -} - -uint64_t bit_array_ops::intersect(uint8_t* tgt, const uint8_t* src, const uint64_t length_bytes) { - uint64_t 
num_bits_set = 0; - for (uint64_t i = 0; i < length_bytes; ++i) { - tgt[i] &= src[i]; - std::bitset<8> bits(tgt[i]); - num_bits_set += bits.count(); - } - return num_bits_set; -} - -uint64_t bit_array_ops::invert(uint8_t* array, const uint64_t length_bytes) { - uint64_t num_bits_set = 0; - for (uint64_t i = 0; i < length_bytes; ++i) { - array[i] = ~array[i]; - std::bitset<8> bits(array[i]); - num_bits_set += bits.count(); - } - return num_bits_set; -} - -} // namespace datasketches - -#endif // _BIT_ARRAY_OPS_IMPL_HPP_ \ No newline at end of file diff --git a/filters/include/bloom_filter.hpp b/filters/include/bloom_filter.hpp index 5e0ed75c..f79fce8c 100644 --- a/filters/include/bloom_filter.hpp +++ b/filters/include/bloom_filter.hpp @@ -69,6 +69,14 @@ class bloom_filter_alloc { public: + bloom_filter_alloc deserialize(const void* bytes, size_t length_bytes, const Allocator& allocator = Allocator()); + + bloom_filter_alloc deserialize(std::istream& is, const A& allocator = A()); + + bloom_filter_alloc wrap(const void* data, size_t length_bytes, const Allocator& allocator = Allocator()); + + bloom_filter_alloc writable_wrap(void* data, size_t length_bytes, const Allocator& allocator = Allocator()); + /** * Checks if the Bloom Filter has processed any items * @return True if the BloomFilter is empty, otherwise False @@ -438,6 +446,20 @@ class bloom_filter_alloc { // TODO: Serialization + /** + * @brief Checks if the Bloom Filter is read-only. + * + * @return True if the filter is read-only, otherwise false. + */ + bool is_read_only() const; + + /** + * @brief Checks if the Bloom Filter has backing memory. + * + * @return True if the filter has backing memory, otherwise false. + */ + bool has_backing_memory() const; + /** * @brief Returns a human-readable string representation of the Bloom Filter. * @param print_filter If true, the filter bits will be printed as well. 
@@ -451,16 +473,43 @@ class bloom_filter_alloc { ~bloom_filter_alloc(); private: + static const uint64_t DIRTY_BITS_VALUE = static_cast(-1LL); static const uint64_t MAX_HEADER_SIZE_BYTES = 32; // 4 Java Longs + static const uint64_t BIT_ARRAY_LENGTH_OFFSET_BYTES = 16; + static const uint64_t NUM_BITS_SET_OFFSET_BYTES = 24; + static const uint64_t BIT_ARRAY_OFFSET_BYTES = 32; static const uint64_t MAX_FILTER_SIZE_BITS = (INT32_MAX - MAX_HEADER_SIZE_BYTES) * sizeof(uint64_t); + static const uint8_t FAMILY_ID = 21; + static const uint8_t SER_VER = 1; + static const uint8_t EMPTY_FLAG_MASK = 4; + bloom_filter_alloc(const uint64_t num_bits, const uint16_t num_hashes, const uint64_t seed, const Allocator& allocator = Allocator()); + bloom_filter_alloc(const uint64_t seed, + const uint16_t num_hashes, + const bool is_dirty, + const bool is_owned, + const bool is_read_only, + const uint64_t capacity_bits, + const uint64_t num_bits_set, + uint8_t* bit_array, + uint8_t* memory, + const Allocator& allocator = Allocator()); + + bloom_filter_alloc internal_deserialize_or_wrap(void* bytes, + size_t length_bytes, + bool read_only, + bool wrap, + const A& allocator); + // internal query/update methods void internal_update(const uint64_t h0, const uint64_t h1); bool internal_query_and_update(const uint64_t h0, const uint64_t h1); bool internal_query(const uint64_t h0, const uint64_t h1) const; + void update_num_bits_set(const uint64_t num_bits_set); + A allocator_; uint64_t seed_; uint16_t num_hashes_; diff --git a/filters/include/bloom_filter_impl.hpp b/filters/include/bloom_filter_impl.hpp index 0dcca364..388e83ca 100644 --- a/filters/include/bloom_filter_impl.hpp +++ b/filters/include/bloom_filter_impl.hpp @@ -26,8 +26,14 @@ #include "common_defs.hpp" #include "bit_array_ops.hpp" +#include "memory_operations.hpp" #include "xxhash64.h" +// memory scenarios: +// * on-heap: owned, bit_array_ set, memory_ null +// * direct: not owned, bit_array_ set, memory_ set +// * read-only an 
option for direct + namespace datasketches { template @@ -59,23 +65,186 @@ bloom_filter_alloc::bloom_filter_alloc(const uint64_t num_bits, const uint16_ memory_ = nullptr; } +template +bloom_filter_alloc::bloom_filter_alloc(const uint64_t seed, + const uint16_t num_hashes, + const bool is_dirty, + const bool is_owned, + const bool is_read_only, + const uint64_t capacity_bits, + const uint64_t num_bits_set, + uint8_t* bit_array, + uint8_t* memory, + const A& allocator) : + allocator_(allocator), + seed_(seed), + num_hashes_(num_hashes), + is_dirty_(is_dirty), + is_owned_(is_owned), + is_read_only_(is_read_only), + capacity_bits_(capacity_bits), + num_bits_set_(num_bits_set), + bit_array_(bit_array), + memory_(memory) +{ + // no consistency checks since we should have done those during reading + if (is_read_only_ && memory_ != nullptr && num_bits_set == DIRTY_BITS_VALUE) { + num_bits_set_ = bit_array_ops::count_num_bits_set(bit_array_, capacity_bits_ >> 3); + } +} + +template +bloom_filter_alloc bloom_filter_alloc::deserialize(const void* bytes, size_t length_bytes, const A& allocator) { + return internal_deserialize_or_wrap(bytes, length_bytes, false, false, allocator); +} + +template +bloom_filter_alloc bloom_filter_alloc::deserialize(std::istream& is, const A& allocator) { + const uint8_t prelongs = read(is); + const uint8_t ser_ver = read(is); + const uint8_t family = read(is); + const uint8_t flags = read(is); + + if (prelongs < 1 || prelongs > 4) { + throw std::invalid_argument("Possible corruption: Incorrect number of preamble bytes specified in header"); + } + if (ser_ver != SER_VER) { + throw std::invalid_argument("Possible corruption: Unrecognized serialization version: " + std::to_string(ser_ver)); + } + if (family != FAMILY_ID) { + throw std::invalid_argument("Possible corruption: Incorrect Family ID for bloom filter. 
Found: " + std::to_string(family)); + } + + const bool is_empty = (flags & EMPTY_FLAG_MASK) != 0; + + const uint16_t num_hashes = read(is); + read(is); // unused + const uint64_t seed = read(is); + const uint64_t num_longs = read(is); // sized in java longs + read(is); // unused + + // if empty, stop reading + if (is_empty) { + return bloom_filter_alloc(num_longs << 6, num_hashes, seed, allocator); + } + + const uint64_t num_bits_set = read(is); + const bool is_dirty = (num_bits_set == DIRTY_BITS_VALUE); + + // allocate memory + const uint64_t num_bytes = num_longs << 3; + uint8_t* bit_array = allocator_.allocate(num_bytes); + if (bit_array == nullptr) { + throw std::bad_alloc(); + } + read(is, bit_array, num_bytes); + + // pass to constructor + return bloom_filter_alloc(seed, num_hashes, is_dirty, false, false, num_longs << 6, num_bits_set, bit_array, nullptr, allocator); +} + +template +bloom_filter_alloc bloom_filter_alloc::wrap(const void* bytes, size_t length_bytes, const A& allocator) { + // read-only flag means we won't modify the memory, but cast away the const + return internal_deserialize_or_wrap(const_cast(bytes), length_bytes, true, true, allocator); +} + +template +bloom_filter_alloc bloom_filter_alloc::writable_wrap(void* bytes, size_t length_bytes, const A& allocator) { + return internal_deserialize_or_wrap(bytes, length_bytes, false, true, allocator); +} + +template +bloom_filter_alloc bloom_filter_alloc::internal_deserialize_or_wrap(void* bytes, + size_t length_bytes, + bool read_only, + bool wrap, + const A& allocator) +{ + ensure_minimum_memory(length_bytes, 8); + if (bytes == nullptr) { + throw std::invalid_argument("Input data is null or empty"); + } + const uint8_t* ptr = static_cast(bytes); + const uint8_t* end_ptr = ptr + length_bytes; + const uint8_t prelongs = *ptr++; + const uint8_t ser_ver = *ptr++; + const uint8_t family = *ptr++; + const uint8_t flags = *ptr++; + + if (prelongs < 1 || prelongs > 4) { + throw 
std::invalid_argument("Possible corruption: Incorrect number of preamble bytes specified in header"); + } + if (ser_ver != SER_VER) { + throw std::invalid_argument("Possible corruption: Unrecognized serialization version: " + std::to_string(ser_ver)); + } + if (family != FAMILY_ID) { + throw std::invalid_argument("Possible corruption: Incorrect Family ID for bloom filter. Found: " + std::to_string(family)); + } + + const bool is_empty = (flags & EMPTY_FLAG_MASK) != 0; + + ensure_minimum_memory(length_bytes, prelongs * sizeof(uint64_t)); + + uint16_t num_hashes; + ptr += copy_from_mem(ptr, num_hashes); + ptr += sizeof(uint16_t); // 16 bits unused after num_hashes + uint64_t seed; + ptr += copy_from_mem(ptr, seed); + + uint64_t num_longs; + ptr += copy_from_mem(ptr, num_longs); // sized in java longs + ptr += sizeof(sizeof(uint32_t)); // unused 32 bits follow + + // if empty, stop reading + if (wrap && is_empty && !read_only) { + throw std::invalid_argument("Cannot wrap an empty filter for writing"); + } else if (is_empty) { + return bloom_filter_alloc(num_longs << 6, num_hashes, seed, allocator); + } + + uint64_t num_bits_set; + ptr += copy_from_mem(ptr, num_bits_set); + const bool is_dirty = (num_bits_set == DIRTY_BITS_VALUE); + + uint8_t* bit_array; + uint8_t* memory; + if (wrap) { + memory = static_cast(bytes); + bit_array = memory + BIT_ARRAY_OFFSET_BYTES; + } else { + // allocate memory + memory = nullptr; + const uint64_t num_bytes = num_longs << 3; + ensure_minimum_memory(end_ptr - ptr, num_bytes); + bit_array = allocator_.allocate(num_bytes); + if (bit_array == nullptr) { + throw std::bad_alloc(); + } + copy_from_mem(ptr, bit_array, num_bytes); + } + + // pass to constructor + return bloom_filter_alloc(seed, num_hashes, is_dirty, wrap, read_only, num_longs << 6, num_bits_set, bit_array, memory, allocator); +} + template bloom_filter_alloc::~bloom_filter_alloc() { - // TODO: handle when only bit_array_ is used - // TODO: handle when memory_ is used and 
bit_array_ is a pointer into it - /* - if (is_owned_ && memory_ != nullptr) { - allocator_.deallocate(memory_, capacity_bits_ >> 3); + if (is_owned_) { + if (memory_ != nullptr) { + // deallocate total memory_ block, including preamble + allocator_.deallocate(memory_, (capacity_bits_ >> 3) + BIT_ARRAY_OFFSET_BYTES); + } else if (bit_array_ != nullptr) { + // only need to deallocate bit_array_ + allocator_.deallocate(bit_array_, capacity_bits_ >> 3); + } memory_ = nullptr; - bit_array_ = nullptr; // just to be safe - } - */ - if (memory_ == nullptr && bit_array_ != nullptr) { - allocator_.deallocate(bit_array_, capacity_bits_ >> 3); bit_array_ = nullptr; } } + + template bool bloom_filter_alloc::is_empty() const { return !is_dirty_ && num_bits_set_ == 0; @@ -105,12 +274,32 @@ uint64_t bloom_filter_alloc::get_seed() const { return seed_; } +template +bool bloom_filter_alloc::is_read_only() const { + return is_read_only_; +} + +template +bool bloom_filter_alloc::has_backing_memory() const { + return memory_ != nullptr; +} + template void bloom_filter_alloc::reset() { - // TODO: if wrapped, update num_bits_set in memory, too - num_bits_set_ = 0; + if (is_read_only_) { + throw std::logic_error("Cannot reset a read-only filter"); + } + update_num_bits_set(0); + std::fill_n(bit_array_, capacity_bits_ >> 3, 0); +} + +template +void bloom_filter_alloc::update_num_bits_set(uint64_t num_bits_set) { + num_bits_set_ = num_bits_set; is_dirty_ = false; - std::fill(bit_array_, bit_array_ + (capacity_bits_ >> 3), 0); + if (memory_ != nullptr && !is_read_only_) { + copy_to_mem(num_bits_set_, memory_ + NUM_BITS_SET_OFFSET_BYTES); + } } // UPDATE METHODS @@ -199,6 +388,9 @@ void bloom_filter_alloc::update(const void* item, size_t size) { template void bloom_filter_alloc::internal_update(const uint64_t h0, const uint64_t h1) { + if (is_read_only_) { + throw std::logic_error("Cannot update a read-only filter"); + } const uint64_t num_bits = get_capacity(); for (uint16_t i = 1; i <= 
num_hashes_; i++) { const uint64_t hash_index = ((h0 + i * h1) >> 1) % num_bits; @@ -293,12 +485,15 @@ bool bloom_filter_alloc::query_and_update(const void* item, size_t size) { template bool bloom_filter_alloc::internal_query_and_update(const uint64_t h0, const uint64_t h1) { + if (is_read_only_) { + throw std::logic_error("Cannot update a read-only filter"); + } const uint64_t num_bits = get_capacity(); bool value_exists = true; for (uint16_t i = 1; i <= num_hashes_; i++) { const uint64_t hash_index = ((h0 + i * h1) >> 1) % num_bits; bool value = bit_array_ops::get_and_set_bit(bit_array_, hash_index); - num_bits_set_ += value ? 0 : 1; + update_num_bits_set(num_bits_set_ + (value ? 0 : 1)); value_exists &= value; } return value_exists; @@ -390,6 +585,7 @@ bool bloom_filter_alloc::query(const void* item, size_t size) const { template bool bloom_filter_alloc::internal_query(const uint64_t h0, const uint64_t h1) const { + if (is_empty()) return false; const uint64_t num_bits = get_capacity(); for (uint16_t i = 1; i <= num_hashes_; i++) { const uint64_t hash_index = ((h0 + i * h1) >> 1) % num_bits; @@ -414,8 +610,8 @@ void bloom_filter_alloc::union_with(const bloom_filter_alloc& other) { if (!is_compatible(other)) { throw std::invalid_argument("Incompatible bloom filters"); } - num_bits_set_ = bit_array_ops::union_with(bit_array_, other.bit_array_, capacity_bits_ >> 3); - is_dirty_ = false; + uint64_t bits_set = bit_array_ops::union_with(bit_array_, other.bit_array_, capacity_bits_ >> 3); + update_num_bits_set(bits_set); } template @@ -423,14 +619,14 @@ void bloom_filter_alloc::intersect(const bloom_filter_alloc& other) { if (!is_compatible(other)) { throw std::invalid_argument("Incompatible bloom filters"); } - num_bits_set_ = bit_array_ops::intersect(bit_array_, other.bit_array_, capacity_bits_ >> 3); - is_dirty_ = false; + uint64_t bits_set = bit_array_ops::intersect(bit_array_, other.bit_array_, capacity_bits_ >> 3); + update_num_bits_set(bits_set); } template 
void bloom_filter_alloc::invert() { - num_bits_set_ = bit_array_ops::invert(bit_array_, capacity_bits_ >> 3); - is_dirty_ = false; + uint64_t bits_set = bit_array_ops::invert(bit_array_, capacity_bits_ >> 3); + update_num_bits_set(bits_set); } template From 687611e52f827d5632f516a6f7acb849b3b7f0b5 Mon Sep 17 00:00:00 2001 From: Jon Malkin <786705+jmalkin@users.noreply.github.com> Date: Fri, 9 Aug 2024 10:32:56 -0700 Subject: [PATCH 09/30] add missing bitset header --- filters/include/bit_array_ops.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/filters/include/bit_array_ops.hpp b/filters/include/bit_array_ops.hpp index 459717ba..cc1a42fa 100644 --- a/filters/include/bit_array_ops.hpp +++ b/filters/include/bit_array_ops.hpp @@ -20,6 +20,8 @@ #ifndef _BIT_ARRAY_OPS_HPP_ #define _BIT_ARRAY_OPS_HPP_ +#include + namespace datasketches { /** From e8966ada78395b9cff614d3e03a1e5a30d8b17ab Mon Sep 17 00:00:00 2001 From: Jon Malkin <786705+jmalkin@users.noreply.github.com> Date: Fri, 9 Aug 2024 22:48:52 -0700 Subject: [PATCH 10/30] WIP: start testing serialization. 
no wrapping yet --- filters/include/bloom_filter.hpp | 42 ++++++--- filters/include/bloom_filter_impl.hpp | 130 ++++++++++++++++++++------ filters/test/bloom_filter_test.cpp | 70 ++++++++++++++ 3 files changed, 205 insertions(+), 37 deletions(-) diff --git a/filters/include/bloom_filter.hpp b/filters/include/bloom_filter.hpp index f79fce8c..9871c308 100644 --- a/filters/include/bloom_filter.hpp +++ b/filters/include/bloom_filter.hpp @@ -69,13 +69,18 @@ class bloom_filter_alloc { public: - bloom_filter_alloc deserialize(const void* bytes, size_t length_bytes, const Allocator& allocator = Allocator()); + using vector_bytes = std::vector::template rebind_alloc>; + vector_bytes serialize(unsigned header_size_bytes = 0) const; - bloom_filter_alloc deserialize(std::istream& is, const A& allocator = A()); + void serialize(std::ostream& os) const; - bloom_filter_alloc wrap(const void* data, size_t length_bytes, const Allocator& allocator = Allocator()); + static bloom_filter_alloc deserialize(const void* bytes, size_t length_bytes, const Allocator& allocator = Allocator()); - bloom_filter_alloc writable_wrap(void* data, size_t length_bytes, const Allocator& allocator = Allocator()); + static bloom_filter_alloc deserialize(std::istream& is, const A& allocator = Allocator()); + + static const bloom_filter_alloc wrap(const void* data, size_t length_bytes, const Allocator& allocator = Allocator()); + + static bloom_filter_alloc writable_wrap(void* data, size_t length_bytes, const Allocator& allocator = Allocator()); /** * Checks if the Bloom Filter has processed any items @@ -460,6 +465,19 @@ class bloom_filter_alloc { */ bool has_backing_memory() const; + /** + * @brief Gets the serialized size of the Bloom Filter in bytes + * @return The serialized size of the Bloom Filter in bytes + */ + size_t get_serialized_size_bytes() const; + + /** + * @brief Gets the serialized size of the Bloom Filter with the given number of bits, in bytes + * @param num_bits The number of bits in 
the Bloom Filter for the size calculation + * @return The serialized size of a Bloom Filter with a capacity of num_bits, in bytes + */ + static size_t get_serialized_size_bytes(const uint64_t num_bits); + /** * @brief Returns a human-readable string representation of the Bloom Filter. * @param print_filter If true, the filter bits will be printed as well. @@ -480,11 +498,13 @@ class bloom_filter_alloc { static const uint64_t BIT_ARRAY_OFFSET_BYTES = 32; static const uint64_t MAX_FILTER_SIZE_BITS = (INT32_MAX - MAX_HEADER_SIZE_BYTES) * sizeof(uint64_t); + static const uint8_t PREAMBLE_LONGS_EMPTY = 3; + static const uint8_t PREAMBLE_LONGS_STANDARD = 4; static const uint8_t FAMILY_ID = 21; static const uint8_t SER_VER = 1; static const uint8_t EMPTY_FLAG_MASK = 4; - bloom_filter_alloc(const uint64_t num_bits, const uint16_t num_hashes, const uint64_t seed, const Allocator& allocator = Allocator()); + bloom_filter_alloc(const uint64_t num_bits, const uint16_t num_hashes, const uint64_t seed, const A& allocator); bloom_filter_alloc(const uint64_t seed, const uint16_t num_hashes, @@ -495,13 +515,13 @@ class bloom_filter_alloc { const uint64_t num_bits_set, uint8_t* bit_array, uint8_t* memory, - const Allocator& allocator = Allocator()); + const A& allocator); - bloom_filter_alloc internal_deserialize_or_wrap(void* bytes, - size_t length_bytes, - bool read_only, - bool wrap, - const A& allocator); + static bloom_filter_alloc internal_deserialize_or_wrap(void* bytes, + size_t length_bytes, + bool read_only, + bool wrap, + const A& allocator); // internal query/update methods void internal_update(const uint64_t h0, const uint64_t h1); diff --git a/filters/include/bloom_filter_impl.hpp b/filters/include/bloom_filter_impl.hpp index 388e83ca..63ebea36 100644 --- a/filters/include/bloom_filter_impl.hpp +++ b/filters/include/bloom_filter_impl.hpp @@ -93,9 +93,28 @@ bloom_filter_alloc::bloom_filter_alloc(const uint64_t seed, } } +template 
+bloom_filter_alloc::~bloom_filter_alloc() { + if (is_owned_) { + if (memory_ != nullptr) { + // deallocate total memory_ block, including preamble + allocator_.deallocate(memory_, (capacity_bits_ >> 3) + BIT_ARRAY_OFFSET_BYTES); + } else if (bit_array_ != nullptr) { + // only need to deallocate bit_array_ + allocator_.deallocate(bit_array_, capacity_bits_ >> 3); + } + memory_ = nullptr; + bit_array_ = nullptr; + } +} + +// TODO: copy, move constructors +// TODO: copy, move assignment operators + template bloom_filter_alloc bloom_filter_alloc::deserialize(const void* bytes, size_t length_bytes, const A& allocator) { - return internal_deserialize_or_wrap(bytes, length_bytes, false, false, allocator); + // not wrapping so we can cast away const as we're not modifying the memory + return internal_deserialize_or_wrap(const_cast(bytes), length_bytes, false, false, allocator); } template @@ -120,7 +139,7 @@ bloom_filter_alloc bloom_filter_alloc::deserialize(std::istream& is, const const uint16_t num_hashes = read(is); read(is); // unused const uint64_t seed = read(is); - const uint64_t num_longs = read(is); // sized in java longs + const uint32_t num_longs = read(is); // sized in java longs read(is); // unused // if empty, stop reading @@ -133,7 +152,8 @@ bloom_filter_alloc bloom_filter_alloc::deserialize(std::istream& is, const // allocate memory const uint64_t num_bytes = num_longs << 3; - uint8_t* bit_array = allocator_.allocate(num_bytes); + A alloc(allocator); + uint8_t* bit_array = alloc.allocate(num_bytes); if (bit_array == nullptr) { throw std::bad_alloc(); } @@ -144,9 +164,9 @@ bloom_filter_alloc bloom_filter_alloc::deserialize(std::istream& is, const } template -bloom_filter_alloc bloom_filter_alloc::wrap(const void* bytes, size_t length_bytes, const A& allocator) { +const bloom_filter_alloc bloom_filter_alloc::wrap(const void* bytes, size_t length_bytes, const A& allocator) { // read-only flag means we won't modify the memory, but cast away the const - return 
internal_deserialize_or_wrap(const_cast(bytes), length_bytes, true, true, allocator); + return const_cast>(internal_deserialize_or_wrap(const_cast(bytes), length_bytes, true, true, allocator)); } template @@ -172,7 +192,7 @@ bloom_filter_alloc bloom_filter_alloc::internal_deserialize_or_wrap(void* const uint8_t family = *ptr++; const uint8_t flags = *ptr++; - if (prelongs < 1 || prelongs > 4) { + if (prelongs < PREAMBLE_LONGS_EMPTY || prelongs > PREAMBLE_LONGS_STANDARD) { throw std::invalid_argument("Possible corruption: Incorrect number of preamble bytes specified in header"); } if (ser_ver != SER_VER) { @@ -192,9 +212,9 @@ bloom_filter_alloc bloom_filter_alloc::internal_deserialize_or_wrap(void* uint64_t seed; ptr += copy_from_mem(ptr, seed); - uint64_t num_longs; + uint32_t num_longs; ptr += copy_from_mem(ptr, num_longs); // sized in java longs - ptr += sizeof(sizeof(uint32_t)); // unused 32 bits follow + ptr += sizeof(uint32_t); // unused 32 bits follow // if empty, stop reading if (wrap && is_empty && !read_only) { @@ -217,7 +237,8 @@ bloom_filter_alloc bloom_filter_alloc::internal_deserialize_or_wrap(void* memory = nullptr; const uint64_t num_bytes = num_longs << 3; ensure_minimum_memory(end_ptr - ptr, num_bytes); - bit_array = allocator_.allocate(num_bytes); + A alloc(allocator); + bit_array = alloc.allocate(num_bytes); if (bit_array == nullptr) { throw std::bad_alloc(); } @@ -229,21 +250,72 @@ bloom_filter_alloc bloom_filter_alloc::internal_deserialize_or_wrap(void* } template -bloom_filter_alloc::~bloom_filter_alloc() { - if (is_owned_) { - if (memory_ != nullptr) { - // deallocate total memory_ block, including preamble - allocator_.deallocate(memory_, (capacity_bits_ >> 3) + BIT_ARRAY_OFFSET_BYTES); - } else if (bit_array_ != nullptr) { - // only need to deallocate bit_array_ - allocator_.deallocate(bit_array_, capacity_bits_ >> 3); - } - memory_ = nullptr; - bit_array_ = nullptr; +void bloom_filter_alloc::serialize(std::ostream& os) const { + const 
uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_EMPTY : PREAMBLE_LONGS_STANDARD; + write(os, preamble_longs); + const uint8_t serial_version = SER_VER; + write(os, serial_version); + const uint8_t family = FAMILY_ID; + write(os, family); + const uint8_t flags_byte = is_empty() ? EMPTY_FLAG_MASK : 0; + write(os, flags_byte); + + write(os, num_hashes_); + write(os, static_cast(0)); // 2 bytes unused + write(os, seed_); + write(os, static_cast(capacity_bits_ >> 6)); // sized in java longs + write(os, static_cast(0)); // 4 bytes unused + + if (!is_empty()) { + write(os, is_dirty_ ? DIRTY_BITS_VALUE : num_bits_set_); + write(os, bit_array_, capacity_bits_ >> 3); } + + os.flush(); } +template +auto bloom_filter_alloc::serialize(unsigned header_size_bytes) const -> vector_bytes { + const size_t size = header_size_bytes + get_serialized_size_bytes(); + vector_bytes bytes(size, 0, allocator_); + uint8_t* ptr = bytes.data() + header_size_bytes; + + const uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_EMPTY : PREAMBLE_LONGS_STANDARD; + ptr += copy_to_mem(preamble_longs, ptr); + const uint8_t serial_version = SER_VER; + ptr += copy_to_mem(serial_version, ptr); + const uint8_t family = FAMILY_ID; + ptr += copy_to_mem(family, ptr); + const uint8_t flags_byte = is_empty() ? EMPTY_FLAG_MASK : 0; + ptr += copy_to_mem(flags_byte, ptr); + ptr += copy_to_mem(num_hashes_, ptr); + ptr += copy_to_mem(static_cast(0), ptr); // 2 bytes unused + ptr += copy_to_mem(seed_, ptr); + ptr += copy_to_mem(static_cast(capacity_bits_ >> 6), ptr); // sized in java longs + ptr += copy_to_mem(static_cast(0), ptr); // 4 bytes unused + + if (!is_empty()) { + ptr += copy_to_mem(is_dirty_ ? DIRTY_BITS_VALUE : num_bits_set_, ptr); + ptr += copy_to_mem(bit_array_, ptr, capacity_bits_ >> 3); + } + + return bytes; +} + +template +size_t bloom_filter_alloc::get_serialized_size_bytes() const { + return sizeof(uint64_t) * (is_empty() ? 
PREAMBLE_LONGS_EMPTY : PREAMBLE_LONGS_STANDARD + (capacity_bits_ >> 6)); +} + +template +size_t bloom_filter_alloc::get_serialized_size_bytes(uint64_t num_bits) { + if (num_bits == 0) + throw std::invalid_argument("Number of bits must be greater than zero"); + + size_t num_bytes = ((num_bits + 63) >> 6) << 6; + return sizeof(uint64_t) * (PREAMBLE_LONGS_STANDARD + num_bytes); +} template bool bloom_filter_alloc::is_empty() const { @@ -559,11 +631,11 @@ bool bloom_filter_alloc::query(const double item) const { int64_t long_value; double double_value; } ldu; - ldu.doubleBytes = static_cast(item); + ldu.double_value = static_cast(item); if (item == 0.0) { - ldu.doubleBytes = 0.0; // canonicalize -0.0 to 0.0 - } else if (std::isnan(ldu.doubleBytes)) { - ldu.longBytes = 0x7ff8000000000000L; // canonicalize NaN using value from Java's Double.doubleToLongBits() + ldu.double_value = 0.0; // canonicalize -0.0 to 0.0 + } else if (std::isnan(ldu.double_value)) { + ldu.long_value = 0x7ff8000000000000L; // canonicalize NaN using value from Java's Double.doubleToLongBits() } const uint64_t h0 = XXHash64::hash(&ldu, sizeof(ldu), seed_); const uint64_t h1 = XXHash64::hash(&ldu, sizeof(ldu), h0); @@ -634,12 +706,18 @@ string bloom_filter_alloc::to_string(bool print_filter) const { // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements. // The stream does not support passing an allocator instance, and alternatives are complicated. 
std::ostringstream oss; + uint64_t num_bits_set = num_bits_set_; + if (is_dirty_) { + num_bits_set = bit_array_ops::count_num_bits_set(bit_array_, capacity_bits_ >> 3); + } + oss << "### Bloom Filter Summary:" << std::endl; oss << " num_bits : " << get_capacity() << std::endl; oss << " num_hashes : " << num_hashes_ << std::endl; oss << " seed : " << seed_ << std::endl; - oss << " bits_used : " << get_bits_used() << std::endl; - oss << " fill % : " << (get_bits_used() * 100.0) / get_capacity() << std::endl; + oss << " is_dirty : " << (is_dirty_ ? "true" : "false") << std::endl; + oss << " bits_used : " << num_bits_set << std::endl; + oss << " fill % : " << (num_bits_set * 100.0) / get_capacity() << std::endl; oss << "### End filter summary" << std::endl; if (print_filter) { diff --git a/filters/test/bloom_filter_test.cpp b/filters/test/bloom_filter_test.cpp index dee240bf..3019783c 100644 --- a/filters/test/bloom_filter_test.cpp +++ b/filters/test/bloom_filter_test.cpp @@ -201,6 +201,76 @@ TEST_CASE("bloom_filter: basic intersection", "[bloom_filter]") { REQUIRE(num_found < num_bits / 10); // not being super strict } +/* +TEST_CASE("bloom_filter: empty serialization", "[bloom_filter]") { + const uint64_t num_bits = 32769; + const uint16_t num_hashes = 7; + + auto bf = bloom_filter_builder::create_by_size(num_bits, num_hashes); + auto bytes = bf.serialize(); + REQUIRE(bytes.size() == bf.get_serialized_size_bytes()); + + auto bf_bytes = bloom_filter::deserialize(bytes.data(), bytes.size()); + REQUIRE(bf.get_capacity() == bf_bytes.get_capacity()); + REQUIRE(bf.get_seed() == bf_bytes.get_seed()); + REQUIRE(bf.get_num_hashes() == bf_bytes.get_num_hashes()); + REQUIRE(bf_bytes.is_empty()); + + std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary); + bf.serialize(ss); + auto bf_stream = bloom_filter::deserialize(ss); + REQUIRE(bf.get_capacity() == bf_stream.get_capacity()); + REQUIRE(bf.get_seed() == bf_stream.get_seed()); + REQUIRE(bf.get_num_hashes() == 
bf_stream.get_num_hashes()); + REQUIRE(bf_stream.is_empty()); +} +*/ +TEST_CASE("bloom_filter: non-empty serialization", "[bloom_filter]") { + const uint64_t num_bits = 32768; + const uint16_t num_hashes = 5; + auto bf = bloom_filter_builder::create_by_size(num_bits, num_hashes); + const uint64_t n = 1000; + for (uint64_t i = 0; i < n; ++i) { + bf.update(0.5 + i); // testing floats + } + + // test more items without updating, assuming some false positives + int count = 0; + for (uint64_t i = n; i < num_bits; ++i) { + count += bf.query(0.5 + i) ? 1 : 0; + } + + auto bytes = bf.serialize(); + REQUIRE(bytes.size() == bf.get_serialized_size_bytes()); + + auto bf_bytes = bloom_filter::deserialize(bytes.data(), bytes.size()); + REQUIRE(bf.get_capacity() == bf_bytes.get_capacity()); + REQUIRE(bf.get_seed() == bf_bytes.get_seed()); + REQUIRE(bf.get_num_hashes() == bf_bytes.get_num_hashes()); + REQUIRE(!bf_bytes.is_empty()); + int count_bytes = 0; + for (uint64_t i = 0; i < num_bits; ++i) { + bool val = bf_bytes.query(0.5 + i); + if (val) ++count_bytes; + if (i < n) REQUIRE(val); + } + REQUIRE(count_bytes == n + count); + + std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary); + bf.serialize(ss); + auto bf_stream = bloom_filter::deserialize(ss); + REQUIRE(bf.get_capacity() == bf_stream.get_capacity()); + REQUIRE(bf.get_seed() == bf_stream.get_seed()); + REQUIRE(bf.get_num_hashes() == bf_stream.get_num_hashes()); + REQUIRE(!bf_stream.is_empty()); + int count_stream = 0; + for (uint64_t i = 0; i < num_bits; ++i) { + bool val = bf_stream.query(0.5 + i); + if (val) ++count_stream; + if (i < n) REQUIRE(val); + } + REQUIRE(count_stream == n + count); +} } // namespace datasketches From 4f7801cc1dbb162aeeb7defd12efea3b721d0e80 Mon Sep 17 00:00:00 2001 From: Jon Date: Mon, 12 Aug 2024 12:23:05 -0700 Subject: [PATCH 11/30] fix types --- filters/test/bloom_filter_test.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/filters/test/bloom_filter_test.cpp b/filters/test/bloom_filter_test.cpp index 3019783c..3e3ca31d 100644 --- a/filters/test/bloom_filter_test.cpp +++ b/filters/test/bloom_filter_test.cpp @@ -249,7 +249,7 @@ TEST_CASE("bloom_filter: non-empty serialization", "[bloom_filter]") { REQUIRE(bf.get_seed() == bf_bytes.get_seed()); REQUIRE(bf.get_num_hashes() == bf_bytes.get_num_hashes()); REQUIRE(!bf_bytes.is_empty()); - int count_bytes = 0; + uint64_t count_bytes = 0; for (uint64_t i = 0; i < num_bits; ++i) { bool val = bf_bytes.query(0.5 + i); if (val) ++count_bytes; @@ -264,7 +264,7 @@ TEST_CASE("bloom_filter: non-empty serialization", "[bloom_filter]") { REQUIRE(bf.get_seed() == bf_stream.get_seed()); REQUIRE(bf.get_num_hashes() == bf_stream.get_num_hashes()); REQUIRE(!bf_stream.is_empty()); - int count_stream = 0; + uint64_t count_stream = 0; for (uint64_t i = 0; i < num_bits; ++i) { bool val = bf_stream.query(0.5 + i); if (val) ++count_stream; From df35436aefda354289a0adbd34dfe4738bf0f536 Mon Sep 17 00:00:00 2001 From: Jon Malkin <786705+jmalkin@users.noreply.github.com> Date: Mon, 12 Aug 2024 12:29:36 -0700 Subject: [PATCH 12/30] improve tests around counting false positive for consistency --- filters/test/bloom_filter_test.cpp | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/filters/test/bloom_filter_test.cpp b/filters/test/bloom_filter_test.cpp index 3e3ca31d..ce50348b 100644 --- a/filters/test/bloom_filter_test.cpp +++ b/filters/test/bloom_filter_test.cpp @@ -201,7 +201,7 @@ TEST_CASE("bloom_filter: basic intersection", "[bloom_filter]") { REQUIRE(num_found < num_bits / 10); // not being super strict } -/* + TEST_CASE("bloom_filter: empty serialization", "[bloom_filter]") { const uint64_t num_bits = 32769; const uint16_t num_hashes = 7; @@ -224,7 +224,7 @@ TEST_CASE("bloom_filter: empty serialization", "[bloom_filter]") { REQUIRE(bf.get_num_hashes() == bf_stream.get_num_hashes()); REQUIRE(bf_stream.is_empty()); } 
-*/ + TEST_CASE("bloom_filter: non-empty serialization", "[bloom_filter]") { const uint64_t num_bits = 32768; const uint16_t num_hashes = 5; @@ -236,9 +236,11 @@ TEST_CASE("bloom_filter: non-empty serialization", "[bloom_filter]") { } // test more items without updating, assuming some false positives - int count = 0; + // so we can check that we get the same number of false positives + // with the same query items + uint64_t fp_count = 0; for (uint64_t i = n; i < num_bits; ++i) { - count += bf.query(0.5 + i) ? 1 : 0; + fp_count += bf.query(0.5 + i) ? 1 : 0; } auto bytes = bf.serialize(); @@ -249,13 +251,15 @@ TEST_CASE("bloom_filter: non-empty serialization", "[bloom_filter]") { REQUIRE(bf.get_seed() == bf_bytes.get_seed()); REQUIRE(bf.get_num_hashes() == bf_bytes.get_num_hashes()); REQUIRE(!bf_bytes.is_empty()); - uint64_t count_bytes = 0; + uint64_t fp_count_bytes = 0; for (uint64_t i = 0; i < num_bits; ++i) { bool val = bf_bytes.query(0.5 + i); - if (val) ++count_bytes; - if (i < n) REQUIRE(val); + if (i < n) + REQUIRE(val); + else if (val) + ++fp_count_bytes; } - REQUIRE(count_bytes == n + count); + REQUIRE(fp_count_bytes == fp_count); std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary); bf.serialize(ss); @@ -264,13 +268,15 @@ TEST_CASE("bloom_filter: non-empty serialization", "[bloom_filter]") { REQUIRE(bf.get_seed() == bf_stream.get_seed()); REQUIRE(bf.get_num_hashes() == bf_stream.get_num_hashes()); REQUIRE(!bf_stream.is_empty()); - uint64_t count_stream = 0; + uint64_t fp_count_stream = 0; for (uint64_t i = 0; i < num_bits; ++i) { bool val = bf_stream.query(0.5 + i); - if (val) ++count_stream; - if (i < n) REQUIRE(val); + if (i < n) + REQUIRE(val); + else if (val) + ++fp_count_stream; } - REQUIRE(count_stream == n + count); + REQUIRE(fp_count_stream == fp_count); } } // namespace datasketches From 3b1d10b1f6819c88d0307359e9e14361dfb6c445 Mon Sep 17 00:00:00 2001 From: Jon Malkin <786705+jmalkin@users.noreply.github.com> Date: Mon, 12 Aug 
2024 17:58:26 -0700 Subject: [PATCH 13/30] partial testing of initialize, no test of wrapping yet --- filters/include/bloom_filter.hpp | 40 +++++-- filters/include/bloom_filter_builder_impl.hpp | 62 ++++++----- filters/include/bloom_filter_impl.hpp | 103 ++++++++++++++---- filters/test/bloom_filter_test.cpp | 87 ++++++++++++--- 4 files changed, 221 insertions(+), 71 deletions(-) diff --git a/filters/include/bloom_filter.hpp b/filters/include/bloom_filter.hpp index 9871c308..d2fca005 100644 --- a/filters/include/bloom_filter.hpp +++ b/filters/include/bloom_filter.hpp @@ -3,6 +3,7 @@ * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at @@ -47,20 +48,34 @@ class bloom_filter_builder_alloc { static bloom_filter_alloc create_by_accuracy(const uint64_t num_distinct_items, const double target_false_positive_prob, - const Allocator& allocator = Allocator()); - static bloom_filter_alloc create_by_accuracy(const uint64_t num_distinct_items, - const double target_false_positive_prob, - const uint64_t seed, + const uint64_t seed = generate_random_seed(), const Allocator& allocator = Allocator()); static bloom_filter_alloc create_by_size(const uint64_t num_bits, const uint16_t num_hashes, - const Allocator& allocator = Allocator()); - static bloom_filter_alloc create_by_size(const uint64_t num_bits, - const uint16_t num_hashes, - const uint64_t seed, + const uint64_t seed = generate_random_seed(), const Allocator& allocator = Allocator()); + static bloom_filter_alloc initialize_by_accuracy(void* memory, + const size_t length_bytes, + const uint64_t num_distinct_items, + const double target_false_positive_prob, + const uint64_t seed = generate_random_seed(), + const Allocator& 
allocator = Allocator()); + + static bloom_filter_alloc initialize_by_size(void* memory, + const size_t length_bytes, + const uint64_t num_bits, + const uint16_t num_hashes, + const uint64_t seed = generate_random_seed(), + const Allocator& allocator = Allocator()); + + /** + * @brief Generates a random 64-bit seed value + * + * @return uint64_t a random value over the range of unsigned 64-bit integers + */ + static uint64_t generate_random_seed(); }; template> @@ -465,6 +480,12 @@ class bloom_filter_alloc { */ bool has_backing_memory() const; + /** + * @brief Returns a pointer to the backing memory, if it exists. + * @return A pointer to the backing memory, or nullptr if it does not exist. + */ + const uint8_t* get_backing_memory() const; + /** * @brief Gets the serialized size of the Bloom Filter in bytes * @return The serialized size of the Bloom Filter in bytes @@ -504,8 +525,11 @@ class bloom_filter_alloc { static const uint8_t SER_VER = 1; static const uint8_t EMPTY_FLAG_MASK = 4; + // used by builder methods bloom_filter_alloc(const uint64_t num_bits, const uint16_t num_hashes, const uint64_t seed, const A& allocator); + bloom_filter_alloc(uint8_t* memory, size_t length_bytes, const uint64_t num_bits, const uint16_t num_hashes, const uint64_t seed, const A& allocator); + // used by deserialize and wrap bloom_filter_alloc(const uint64_t seed, const uint16_t num_hashes, const bool is_dirty, diff --git a/filters/include/bloom_filter_builder_impl.hpp b/filters/include/bloom_filter_builder_impl.hpp index a176aadf..6c16bb29 100644 --- a/filters/include/bloom_filter_builder_impl.hpp +++ b/filters/include/bloom_filter_builder_impl.hpp @@ -29,6 +29,16 @@ namespace datasketches { +template +uint64_t bloom_filter_builder_alloc::generate_random_seed() { + union { + uint64_t long_value; + double double_value; + } ldu; + ldu.double_value = random_utils::next_double(random_utils::rand); + return ldu.long_value; +} + template uint16_t 
bloom_filter_builder_alloc::suggest_num_hashes(const uint64_t num_distinct_items, const uint64_t num_filter_bits) { @@ -38,29 +48,17 @@ uint16_t bloom_filter_builder_alloc::suggest_num_hashes(const uint64_t num_di template uint16_t bloom_filter_builder_alloc::suggest_num_hashes(const double target_false_positive_prob) { + // TODO: validate inputs return static_cast(std::ceil(-log(target_false_positive_prob) / log(2.0))); } template uint64_t bloom_filter_builder_alloc::suggest_num_filter_bits(const uint64_t max_distinct_items, const double target_false_positive_prob) { + // TODO: validate inputs return static_cast(std::ceil(-static_cast(max_distinct_items) * log(target_false_positive_prob) / (log(2.0) * log(2.0)))); } - -template -bloom_filter_alloc bloom_filter_builder_alloc::create_by_accuracy(const uint64_t num_distinct_items, - const double target_false_positive_prob, - const A& allocator) { - union { - int64_t long_value; - double double_value; - } ldu; - ldu.double_value = random_utils::next_double(random_utils::rand); - const uint64_t seed = ldu.long_value; - return create_by_accuracy(num_distinct_items, target_false_positive_prob, seed, allocator); -} - template bloom_filter_alloc bloom_filter_builder_alloc::create_by_accuracy(const uint64_t num_distinct_items, const double target_false_positive_prob, @@ -74,22 +72,34 @@ bloom_filter_alloc bloom_filter_builder_alloc::create_by_accuracy(const ui template bloom_filter_alloc bloom_filter_builder_alloc::create_by_size(const uint64_t num_bits, const uint16_t num_hashes, + const uint64_t seed, const A& allocator) { - union { - int64_t long_value; - double double_value; - } ldu; - ldu.double_value = random_utils::next_double(random_utils::rand); - const uint64_t seed = ldu.long_value; - return create_by_size(num_bits, num_hashes, seed, allocator); + // TODO: validate inputs + return bloom_filter_alloc(num_bits, num_hashes, seed, allocator); } template -bloom_filter_alloc 
bloom_filter_builder_alloc::create_by_size(const uint64_t num_bits, - const uint16_t num_hashes, - const uint64_t seed, - const A& allocator) { - return bloom_filter_alloc(num_bits, num_hashes, seed, allocator); +bloom_filter_alloc bloom_filter_builder_alloc::initialize_by_accuracy(void* memory, + const size_t length_bytes, + const uint64_t num_distinct_items, + const double target_false_positive_prob, + const uint64_t seed, + const A& allocator) { + // TODO: validate inputs + const uint64_t num_filter_bits = bloom_filter_builder_alloc::suggest_num_filter_bits(num_distinct_items, target_false_positive_prob); + const uint16_t num_hashes = bloom_filter_builder_alloc::suggest_num_hashes(target_false_positive_prob); + return bloom_filter_alloc(static_cast(memory), length_bytes, num_filter_bits, num_hashes, seed, allocator); +} + +template +bloom_filter_alloc bloom_filter_builder_alloc::initialize_by_size(void* memory, + const size_t length_bytes, + const uint64_t num_bits, + const uint16_t num_hashes, + const uint64_t seed, + const A& allocator) { + // TODO: validate inputs + return bloom_filter_alloc(static_cast(memory), length_bytes, num_bits, num_hashes, seed, allocator); } } // namespace datasketches diff --git a/filters/include/bloom_filter_impl.hpp b/filters/include/bloom_filter_impl.hpp index 63ebea36..3c33ea7c 100644 --- a/filters/include/bloom_filter_impl.hpp +++ b/filters/include/bloom_filter_impl.hpp @@ -44,26 +44,79 @@ bloom_filter_alloc::bloom_filter_alloc(const uint64_t num_bits, const uint16_ is_dirty_(false), is_owned_(true), is_read_only_(false), - capacity_bits_(((num_bits + 63) >> 6) << 6), // can round to nearest multiple of 64 prior to bounds checks + capacity_bits_((num_bits + 63) & ~0x3F), // can round to nearest multiple of 64 prior to bounds checks num_bits_set_(0) - { - if (num_hashes == 0) { - throw std::invalid_argument("Must have at least 1 hash function"); - } - if (num_bits == 0) { - throw std::invalid_argument("Number of bits must be 
greater than zero"); - } else if (num_bits > MAX_FILTER_SIZE_BITS) { - throw std::invalid_argument("Filter may not exceed " + std::to_string(MAX_FILTER_SIZE_BITS) + " bits"); - } +{ + if (num_hashes == 0) { + throw std::invalid_argument("Must have at least 1 hash function"); + } + if (num_bits == 0) { + throw std::invalid_argument("Number of bits must be greater than zero"); + } else if (num_bits > MAX_FILTER_SIZE_BITS) { + throw std::invalid_argument("Filter may not exceed " + std::to_string(MAX_FILTER_SIZE_BITS) + " bits"); + } - const uint64_t num_bytes = capacity_bits_ >> 3; - bit_array_ = allocator_.allocate(num_bytes); - std::fill_n(bit_array_, num_bytes, 0); - if (bit_array_ == nullptr) { - throw std::bad_alloc(); - } - memory_ = nullptr; + const uint64_t num_bytes = capacity_bits_ >> 3; + bit_array_ = allocator_.allocate(num_bytes); + std::fill_n(bit_array_, num_bytes, 0); + if (bit_array_ == nullptr) { + throw std::bad_alloc(); } + memory_ = nullptr; +} + +template +bloom_filter_alloc::bloom_filter_alloc(uint8_t* memory, + size_t length_bytes, + const uint64_t num_bits, + const uint16_t num_hashes, + const uint64_t seed, + const A& allocator) : + allocator_(allocator), + seed_(seed), + num_hashes_(num_hashes), + is_dirty_(false), + is_owned_(false), + is_read_only_(false), + capacity_bits_((num_bits + 63) & ~0x3F), // can round to nearest multiple of 64 prior to bounds checks + num_bits_set_(0) +{ + if (num_hashes == 0) { + throw std::invalid_argument("Must have at least 1 hash function"); + } + if (num_bits == 0) { + throw std::invalid_argument("Number of bits must be greater than zero"); + } else if (num_bits > MAX_FILTER_SIZE_BITS) { + throw std::invalid_argument("Filter may not exceed " + std::to_string(MAX_FILTER_SIZE_BITS) + " bits"); + } + + const size_t num_bytes = get_serialized_size_bytes(capacity_bits_); + if (length_bytes < num_bytes) { + throw std::invalid_argument("Input memory block is too small"); + } + + // fill in header info + uint8_t* 
ptr = memory; + const uint8_t preamble_longs = PREAMBLE_LONGS_STANDARD; // no resizing so assume non-empty + ptr += copy_to_mem(preamble_longs, ptr); + const uint8_t serial_version = SER_VER; + ptr += copy_to_mem(serial_version, ptr); + const uint8_t family = FAMILY_ID; + ptr += copy_to_mem(family, ptr); + const uint8_t flags_byte = 0; // again, assuming non-empty + ptr += copy_to_mem(flags_byte, ptr); + + ptr += copy_to_mem(num_hashes_, ptr); + ptr += copy_to_mem(static_cast(0), ptr); // 2 bytes unused + ptr += copy_to_mem(seed_, ptr); + ptr += copy_to_mem(static_cast(capacity_bits_ >> 6), ptr); // sized in java longs + ptr += copy_to_mem(static_cast(0), ptr); // 4 bytes unused + + // rest of memory is num bits and bit array, so start with zeroes + std::fill_n(ptr, sizeof(uint64_t) * ((capacity_bits_ >> 6) + 1), 0); + bit_array_ = memory + BIT_ARRAY_OFFSET_BYTES; + memory_ = memory; +} template bloom_filter_alloc::bloom_filter_alloc(const uint64_t seed, @@ -82,12 +135,13 @@ bloom_filter_alloc::bloom_filter_alloc(const uint64_t seed, is_dirty_(is_dirty), is_owned_(is_owned), is_read_only_(is_read_only), - capacity_bits_(capacity_bits), + capacity_bits_((capacity_bits + 63) & ~0x3F), num_bits_set_(num_bits_set), bit_array_(bit_array), memory_(memory) { - // no consistency checks since we should have done those during reading + // private constructor + // no consistency checks since we should have done those prior to calling this if (is_read_only_ && memory_ != nullptr && num_bits_set == DIRTY_BITS_VALUE) { num_bits_set_ = bit_array_ops::count_num_bits_set(bit_array_, capacity_bits_ >> 3); } @@ -251,6 +305,7 @@ bloom_filter_alloc bloom_filter_alloc::internal_deserialize_or_wrap(void* template void bloom_filter_alloc::serialize(std::ostream& os) const { + // Should we serialize memory_ directly if it exists? const uint8_t preamble_longs = is_empty() ? 
PREAMBLE_LONGS_EMPTY : PREAMBLE_LONGS_STANDARD; write(os, preamble_longs); const uint8_t serial_version = SER_VER; @@ -276,6 +331,7 @@ void bloom_filter_alloc::serialize(std::ostream& os) const { template auto bloom_filter_alloc::serialize(unsigned header_size_bytes) const -> vector_bytes { + // Should we serialize memory_ directly if it exists? const size_t size = header_size_bytes + get_serialized_size_bytes(); vector_bytes bytes(size, 0, allocator_); uint8_t* ptr = bytes.data() + header_size_bytes; @@ -309,11 +365,11 @@ size_t bloom_filter_alloc::get_serialized_size_bytes() const { } template -size_t bloom_filter_alloc::get_serialized_size_bytes(uint64_t num_bits) { +size_t bloom_filter_alloc::get_serialized_size_bytes(const uint64_t num_bits) { if (num_bits == 0) throw std::invalid_argument("Number of bits must be greater than zero"); - size_t num_bytes = ((num_bits + 63) >> 6) << 6; + size_t num_bytes = (num_bits + 63) >> 6; return sizeof(uint64_t) * (PREAMBLE_LONGS_STANDARD + num_bytes); } @@ -356,6 +412,11 @@ bool bloom_filter_alloc::has_backing_memory() const { return memory_ != nullptr; } +template +const uint8_t* bloom_filter_alloc::get_backing_memory() const { + return memory_; +} + template void bloom_filter_alloc::reset() { if (is_read_only_) { diff --git a/filters/test/bloom_filter_test.cpp b/filters/test/bloom_filter_test.cpp index ce50348b..2b88ffd1 100644 --- a/filters/test/bloom_filter_test.cpp +++ b/filters/test/bloom_filter_test.cpp @@ -47,44 +47,99 @@ TEST_CASE("bloom_filter: standard constructors", "[bloom_filter]") { uint64_t adjusted_num_bits = (num_bits + 63) & ~0x3F; // round up to the nearest multiple of 64 REQUIRE(bf.get_capacity() == adjusted_num_bits); REQUIRE(bf.get_num_hashes() == num_hashes); + REQUIRE(bf.get_seed() == seed); REQUIRE(bf.is_empty()); + + // should match above + bf = bloom_filter_builder::create_by_accuracy(num_items, fpp, seed); + REQUIRE(bf.get_capacity() == adjusted_num_bits); + REQUIRE(bf.get_num_hashes() == 
num_hashes); + REQUIRE(bf.get_seed() == seed); + REQUIRE(bf.is_empty()); + + // same for initializing memory in-place + size_t serialized_size_bytes = bloom_filter::get_serialized_size_bytes(num_bits); + uint8_t* bytes = new uint8_t[serialized_size_bytes]; + + bf = bloom_filter_builder::initialize_by_size(bytes, serialized_size_bytes, num_bits, num_hashes, seed); + REQUIRE(bf.get_capacity() == adjusted_num_bits); + REQUIRE(bf.get_num_hashes() == num_hashes); + REQUIRE(bf.get_seed() == seed); + REQUIRE(bf.is_empty()); + + bf = bloom_filter_builder::initialize_by_accuracy(bytes, serialized_size_bytes, num_items, fpp, seed); + REQUIRE(bf.get_capacity() == adjusted_num_bits); + REQUIRE(bf.get_num_hashes() == num_hashes); + REQUIRE(bf.get_seed() == seed); + REQUIRE(bf.is_empty()); + + delete [] bytes; } TEST_CASE("bloom_filter: basic operations", "[bloom_filter]") { - uint64_t num_bits = 8192; - uint16_t num_hashes = 3; + uint64_t num_items = 5000; + double fpp = 0.01; - auto bf = bloom_filter_builder::create_by_size(num_bits, num_hashes); + auto bf = bloom_filter_builder::create_by_accuracy(num_items, fpp); REQUIRE(bf.is_empty()); - REQUIRE(bf.get_capacity() == num_bits); // num_bits is multiple of 64 so should be exact - REQUIRE(bf.get_num_hashes() == num_hashes); REQUIRE(bf.get_bits_used() == 0); - uint64_t n = 1000; - for (uint64_t i = 0; i < n; ++i) { + for (uint64_t i = 0; i < num_items; ++i) { bf.query_and_update(i); } REQUIRE(!bf.is_empty()); - // these assume the filter isn't too close to capacity - REQUIRE(bf.get_bits_used() <= n * num_hashes); - REQUIRE(bf.get_bits_used() >= n * (num_hashes - 1)); + // filter is about 50% full at target capacity + REQUIRE(bf.get_bits_used() == Approx(0.5 * bf.get_capacity()).epsilon(0.05)); uint32_t num_found = 0; - for (uint64_t i = 0; i < n; ++i) { + for (uint64_t i = num_items; i < bf.get_capacity(); ++i) { if (bf.query(i)) { ++num_found; } } - REQUIRE(num_found >= n); - REQUIRE(num_found < 1.1 * n); + // fpp is average 
with significant variance + REQUIRE(num_found == Approx((bf.get_capacity() - num_items) * fpp).epsilon(0.12)); + auto bytes = bf.serialize(); + + // initialize in memory and run the same tests + // also checking against the results from the first part + uint8_t* bf_memory = new uint8_t[bytes.size()]; + auto bf2 = bloom_filter_builder::initialize_by_accuracy(bf_memory, bytes.size(), num_items, fpp, bf.get_seed()); + REQUIRE(bf2.is_empty()); + REQUIRE(bf2.get_bits_used() == 0); + + for (uint64_t i = 0; i < num_items; ++i) { + bf2.query_and_update(i); + } + + REQUIRE(!bf2.is_empty()); + REQUIRE(bf2.get_bits_used() == bf.get_bits_used()); // should exactly match above + uint32_t num_found2 = 0; + for (uint64_t i = num_items; i < bf2.get_capacity(); ++i) { + if (bf2.query(i)) { + ++num_found2; + } + } + REQUIRE(num_found == num_found2); // should exactly match above + auto bytes2 = bf2.serialize(); + + // ensure the filters reset properly bf.reset(); - // repeat initial tests from above REQUIRE(bf.is_empty()); - REQUIRE(bf.get_capacity() == num_bits); - REQUIRE(bf.get_num_hashes() == num_hashes); REQUIRE(bf.get_bits_used() == 0); + + bf2.reset(); + REQUIRE(bf2.is_empty()); + REQUIRE(bf2.get_bits_used() == 0); + + REQUIRE(bytes.size() == bytes2.size()); + for (size_t i = 0; i < bytes.size(); ++i) { + REQUIRE(bytes[i] == bytes2[i]); + } + + delete [] bf_memory; } TEST_CASE("bloom_filter: inversion", "[bloom_filter]") { From 1a44c449a3db0608b08a0f95d206e450b583ff32 Mon Sep 17 00:00:00 2001 From: Jon Malkin <786705+jmalkin@users.noreply.github.com> Date: Mon, 12 Aug 2024 18:01:54 -0700 Subject: [PATCH 14/30] test using raw memory from sketch vs serialization --- filters/test/bloom_filter_test.cpp | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/filters/test/bloom_filter_test.cpp b/filters/test/bloom_filter_test.cpp index 2b88ffd1..bce29411 100644 --- a/filters/test/bloom_filter_test.cpp +++ b/filters/test/bloom_filter_test.cpp @@ -125,6 
+125,18 @@ TEST_CASE("bloom_filter: basic operations", "[bloom_filter]") { REQUIRE(num_found == num_found2); // should exactly match above auto bytes2 = bf2.serialize(); + REQUIRE(bytes.size() == bytes2.size()); + for (size_t i = 0; i < bytes.size(); ++i) { + REQUIRE(bytes[i] == bytes2[i]); + } + + // check that raw memory also matches serialized sketch + const uint8_t* bf_bytes = bf2.get_backing_memory(); + REQUIRE(bf_bytes == bf_memory); + for (size_t i = 0; i < bytes.size(); ++i) { + REQUIRE(bf_bytes[i] == bytes[i]); + } + // ensure the filters reset properly bf.reset(); REQUIRE(bf.is_empty()); @@ -134,11 +146,6 @@ TEST_CASE("bloom_filter: basic operations", "[bloom_filter]") { REQUIRE(bf2.is_empty()); REQUIRE(bf2.get_bits_used() == 0); - REQUIRE(bytes.size() == bytes2.size()); - for (size_t i = 0; i < bytes.size(); ++i) { - REQUIRE(bytes[i] == bytes2[i]); - } - delete [] bf_memory; } From 65bd15f294210a352fc33fd11dd73bc3191d8690 Mon Sep 17 00:00:00 2001 From: Jon Malkin <786705+jmalkin@users.noreply.github.com> Date: Mon, 12 Aug 2024 18:50:48 -0700 Subject: [PATCH 15/30] functionally complete, still need a bit more documentation --- filters/include/bloom_filter.hpp | 8 +++- filters/include/bloom_filter_impl.hpp | 13 +++++-- filters/test/bloom_filter_test.cpp | 56 +++++++++++++++++++++++++++ 3 files changed, 71 insertions(+), 6 deletions(-) diff --git a/filters/include/bloom_filter.hpp b/filters/include/bloom_filter.hpp index d2fca005..a7865681 100644 --- a/filters/include/bloom_filter.hpp +++ b/filters/include/bloom_filter.hpp @@ -464,8 +464,6 @@ class bloom_filter_alloc { */ bool is_compatible(const bloom_filter_alloc& other) const; - // TODO: Serialization - /** * @brief Checks if the Bloom Filter is read-only. 
* @@ -473,6 +471,12 @@ class bloom_filter_alloc { */ bool is_read_only() const; + /** + * @brief Returns whether the filter owns its underlying memory + * @return True if the filter owns its memory, otherwise false + */ + bool is_memory_owned() const; + /** * @brief Checks if the Bloom Filter has backing memory. * diff --git a/filters/include/bloom_filter_impl.hpp b/filters/include/bloom_filter_impl.hpp index 3c33ea7c..7baee8f0 100644 --- a/filters/include/bloom_filter_impl.hpp +++ b/filters/include/bloom_filter_impl.hpp @@ -214,13 +214,13 @@ bloom_filter_alloc bloom_filter_alloc::deserialize(std::istream& is, const read(is, bit_array, num_bytes); // pass to constructor - return bloom_filter_alloc(seed, num_hashes, is_dirty, false, false, num_longs << 6, num_bits_set, bit_array, nullptr, allocator); + return bloom_filter_alloc(seed, num_hashes, is_dirty, true, false, num_longs << 6, num_bits_set, bit_array, nullptr, allocator); } template const bloom_filter_alloc bloom_filter_alloc::wrap(const void* bytes, size_t length_bytes, const A& allocator) { // read-only flag means we won't modify the memory, but cast away the const - return const_cast>(internal_deserialize_or_wrap(const_cast(bytes), length_bytes, true, true, allocator)); + return internal_deserialize_or_wrap(const_cast(bytes), length_bytes, true, true, allocator); } template @@ -299,8 +299,8 @@ bloom_filter_alloc bloom_filter_alloc::internal_deserialize_or_wrap(void* copy_from_mem(ptr, bit_array, num_bytes); } - // pass to constructor - return bloom_filter_alloc(seed, num_hashes, is_dirty, wrap, read_only, num_longs << 6, num_bits_set, bit_array, memory, allocator); + // pass to constructor -- !wrap == is_owned_ + return bloom_filter_alloc(seed, num_hashes, is_dirty, !wrap, read_only, num_longs << 6, num_bits_set, bit_array, memory, allocator); } template @@ -412,6 +412,11 @@ bool bloom_filter_alloc::has_backing_memory() const { return memory_ != nullptr; } +template +bool 
bloom_filter_alloc::is_memory_owned() const { + return is_owned_; +} + template const uint8_t* bloom_filter_alloc::get_backing_memory() const { return memory_; diff --git a/filters/test/bloom_filter_test.cpp b/filters/test/bloom_filter_test.cpp index bce29411..16b2019d 100644 --- a/filters/test/bloom_filter_test.cpp +++ b/filters/test/bloom_filter_test.cpp @@ -285,6 +285,16 @@ TEST_CASE("bloom_filter: empty serialization", "[bloom_filter]") { REQUIRE(bf.get_seed() == bf_stream.get_seed()); REQUIRE(bf.get_num_hashes() == bf_stream.get_num_hashes()); REQUIRE(bf_stream.is_empty()); + + // read-only wrap should work + auto bf_wrap = bloom_filter::wrap(bytes.data(), bytes.size()); + REQUIRE(bf.get_capacity() == bf_wrap.get_capacity()); + REQUIRE(bf.get_seed() == bf_wrap.get_seed()); + REQUIRE(bf.get_num_hashes() == bf_wrap.get_num_hashes()); + REQUIRE(bf_wrap.is_empty()); + + // writable wrap should not + REQUIRE_THROWS_AS(bloom_filter::writable_wrap(bytes.data(), bytes.size()), std::invalid_argument); } TEST_CASE("bloom_filter: non-empty serialization", "[bloom_filter]") { @@ -313,6 +323,7 @@ TEST_CASE("bloom_filter: non-empty serialization", "[bloom_filter]") { REQUIRE(bf.get_seed() == bf_bytes.get_seed()); REQUIRE(bf.get_num_hashes() == bf_bytes.get_num_hashes()); REQUIRE(!bf_bytes.is_empty()); + REQUIRE(bf.is_memory_owned()); uint64_t fp_count_bytes = 0; for (uint64_t i = 0; i < num_bits; ++i) { bool val = bf_bytes.query(0.5 + i); @@ -330,6 +341,7 @@ TEST_CASE("bloom_filter: non-empty serialization", "[bloom_filter]") { REQUIRE(bf.get_seed() == bf_stream.get_seed()); REQUIRE(bf.get_num_hashes() == bf_stream.get_num_hashes()); REQUIRE(!bf_stream.is_empty()); + REQUIRE(bf_stream.is_memory_owned()); uint64_t fp_count_stream = 0; for (uint64_t i = 0; i < num_bits; ++i) { bool val = bf_stream.query(0.5 + i); @@ -339,6 +351,50 @@ TEST_CASE("bloom_filter: non-empty serialization", "[bloom_filter]") { ++fp_count_stream; } REQUIRE(fp_count_stream == fp_count); + + // 
read-only wrap + auto bf_wrap = bloom_filter::wrap(bytes.data(), bytes.size()); + REQUIRE(bf.get_capacity() == bf_wrap.get_capacity()); + REQUIRE(bf.get_seed() == bf_wrap.get_seed()); + REQUIRE(bf.get_num_hashes() == bf_wrap.get_num_hashes()); + REQUIRE(!bf_wrap.is_empty()); + REQUIRE(!bf_wrap.is_memory_owned()); + uint64_t fp_count_wrap = 0; + for (uint64_t i = 0; i < num_bits; ++i) { + bool val = bf_wrap.query(0.5 + i); + if (i < n) + REQUIRE(val); + else if (val) + ++fp_count_wrap; + } + REQUIRE(fp_count_wrap == fp_count); + REQUIRE_THROWS_AS(bf_wrap.update(0.5), std::logic_error); + REQUIRE_THROWS_AS(bf_wrap.reset(), std::logic_error); + + // writable wrap + auto bf_writable = bloom_filter::writable_wrap(bytes.data(), bytes.size()); + REQUIRE(bf.get_capacity() == bf_writable.get_capacity()); + REQUIRE(bf.get_seed() == bf_writable.get_seed()); + REQUIRE(bf.get_num_hashes() == bf_writable.get_num_hashes()); + REQUIRE(!bf_writable.is_empty()); + REQUIRE(!bf_writable.is_memory_owned()); + uint64_t fp_count_writable = 0; + for (uint64_t i = 0; i < num_bits; ++i) { + bool val = bf_writable.query(0.5 + i); + if (i < n) + REQUIRE(val); + else if (val) + ++fp_count_writable; + } + REQUIRE(fp_count_writable == fp_count); + + REQUIRE(!bf_writable.query(-1.0)); + bf_writable.update(-1.0); + REQUIRE(bf_writable.query(-1.0)); + + // not good memory management to do this, but because we wrapped the same bytes as both + // read-only and writable, that update should have changed the read-only version, too + REQUIRE(bf_wrap.query(-1.0)); } } // namespace datasketches From be58ab66ed6d57441b9373599942adf427e46aa0 Mon Sep 17 00:00:00 2001 From: Jon Malkin <786705+jmalkin@users.noreply.github.com> Date: Tue, 13 Aug 2024 09:47:42 -0700 Subject: [PATCH 16/30] finish documenting, add copy/move ctors and assignment operators --- filters/include/bloom_filter.hpp | 170 ++++++++++++++++-- filters/include/bloom_filter_builder_impl.hpp | 54 ++++-- filters/include/bloom_filter_impl.hpp | 79
+++++++- filters/test/bloom_filter_test.cpp | 3 +- 4 files changed, 276 insertions(+), 30 deletions(-) diff --git a/filters/include/bloom_filter.hpp b/filters/include/bloom_filter.hpp index a7865681..adfc6231 100644 --- a/filters/include/bloom_filter.hpp +++ b/filters/include/bloom_filter.hpp @@ -42,27 +42,94 @@ class bloom_filter_builder_alloc { using A = Allocator; public: - static uint16_t suggest_num_hashes(const uint64_t num_distinct_items, const uint64_t num_filter_bits); + /** + * Returns the optimal number of hash functions to given target numbers of distinct items + * and the Bloom filter size in bits. This function will provide a result even if the input + * values exceed the capacity of a single Bloom filter. + * @param max_distinct_items The maximum expected number of distinct items to add to the filter + * @param num_filter_bits The intended size of the Bloom Filter in bits + * @return The suggested number of hash functions to use with the filter + */ + static uint16_t suggest_num_hashes(const uint64_t max_distinct_items, const uint64_t num_filter_bits); + + /** + * Returns the optimal number of hash functions to achieve a target false positive probability. + * @param target_false_positive_prob A desired false positive probability per item + * @return The suggested number of hash functions to use with the filter. + */ static uint16_t suggest_num_hashes(const double target_false_positive_prob); - static uint64_t suggest_num_filter_bits(const uint64_t num_distinct_items, const double target_false_positive_prob); - static bloom_filter_alloc create_by_accuracy(const uint64_t num_distinct_items, + /** + * Returns the optimal number of bits to use in a Bloom filter given a target number of distinct + * items and a target false positive probability. 
+ * @param max_distinct_items The maximum expected number of distinct items to add to the filter + * @param target_false_positive_prob A desired false positive probability per item + * @return The suggested number of bits to use with the filter + */ + static uint64_t suggest_num_filter_bits(const uint64_t max_distinct_items, const double target_false_positive_prob); + + /** + * Creates a new Bloom filter with an optimal number of bits and hash functions for the given inputs, + * using a random base seed for the hash function. + * @param max_distinct_items The maximum expected number of distinct items to add to the filter + * @param target_false_positive_prob A desired false positive probability per item + * @param seed A base hash seed (default: random) + * @param allocator The allocator to use for the filter (default: standard allocator) + * @return A new Bloom filter configured for the given input parameters + */ + static bloom_filter_alloc create_by_accuracy(const uint64_t max_distinct_items, const double target_false_positive_prob, const uint64_t seed = generate_random_seed(), const Allocator& allocator = Allocator()); + /** + * Creates a Bloom filter with given number of bits and number of hash functions, + * using the provided base seed for the hash function.
+ * + * @param num_bits The size of the BloomFilter, in bits + * @param num_hashes The number of hash functions to apply to items + * @param seed A base hash seed (default: random) + * @param allocator The allocator to use for the filter (default: standard allocator) + * @return A new Bloom filter configured for the given input parameters + */ static bloom_filter_alloc create_by_size(const uint64_t num_bits, const uint16_t num_hashes, const uint64_t seed = generate_random_seed(), const Allocator& allocator = Allocator()); + /** + * Creates a new Bloom filter with an optimal number of bits and hash functions for the given inputs, + * using a random base seed for the hash function and writing into the provided memory. The filter does + * not take ownership of the memory but does overwrite the full contents. + * + * @param memory A pointer to the memory to use for the filter + * @param length_bytes The length of the memory in bytes + * @param max_distinct_items The maximum expected number of distinct items to add to the filter + * @param target_false_positive_prob A desired false positive probability per item + * @param seed A base hash seed (default: random) + * @param allocator The allocator to use for the filter (default: standard allocator) + * @return A new Bloom filter configured for the given input parameters in the provided memory + */ static bloom_filter_alloc initialize_by_accuracy(void* memory, const size_t length_bytes, - const uint64_t num_distinct_items, + const uint64_t max_distinct_items, const double target_false_positive_prob, const uint64_t seed = generate_random_seed(), const Allocator& allocator = Allocator()); + /** + * Initializes a Bloom filter with given number of bits and number of hash functions, + * using the provided base seed for the hash function and writing into the provided memory. The filter does + * not take ownership of the memory but does overwrite the full contents.
+ * + * @param memory A pointer to the memory to use for the filter + * @param length_bytes The length of the memory in bytes + * @param num_bits The size of the BloomFilter, in bits + * @param num_hashes The number of hash functions to apply to items + * @param seed A base hash seed (default: random) + * @param allocator The allocator to use for the filter (default: standard allocator) + * @return A new BloomFilter configured for the given input parameters + */ static bloom_filter_alloc initialize_by_size(void* memory, const size_t length_bytes, const uint64_t num_bits, @@ -76,6 +143,10 @@ class bloom_filter_builder_alloc { * @return uint64_t a random value over the range of unsigned 64-bit integers */ static uint64_t generate_random_seed(); + +private: + static void validate_size_inputs(uint64_t num_bits, uint16_t num_hashes); + static void validate_accuracy_inputs(uint64_t max_distinct_items, double target_false_positive_prob); }; template> @@ -84,19 +155,95 @@ class bloom_filter_alloc { public: - using vector_bytes = std::vector::template rebind_alloc>; - vector_bytes serialize(unsigned header_size_bytes = 0) const; - - void serialize(std::ostream& os) const; - + /** + * This method deserializes a Bloom filter from a given array of bytes. + * @param bytes pointer to the array of bytes + * @param length_bytes the size of the array in bytes + * @param allocator instance of an Allocator + * @return an instance of a Bloom filter + */ static bloom_filter_alloc deserialize(const void* bytes, size_t length_bytes, const Allocator& allocator = Allocator()); + /** + * This method deserializes a Bloom filter from a given stream. + * @param is input stream + * @param allocator instance of an Allocator + * @return an instance of a Bloom filter + */ static bloom_filter_alloc deserialize(std::istream& is, const A& allocator = Allocator()); + /** + * @brief Wraps the provided memory as a read-only Bloom filter. Reads the data in-place and does + * not take ownership of the underlying memory.
Does not allow modifying the filter. + * + * @param data The memory to wrap + * @param length_bytes The length of the memory in bytes + * @param allocator instance of an Allocator + * @return a const (read-only) Bloom filter wrapping the provided memory + */ static const bloom_filter_alloc wrap(const void* data, size_t length_bytes, const Allocator& allocator = Allocator()); + /** + * @brief Wraps the provided memory as a writable Bloom filter. Reads the data in-place and does + * not take ownership of the underlying memory. Allows modifying the filter. + * + * @param data the memory to wrap + * @param length_bytes the length of the memory in bytes + * @param allocator instance of an Allocator + * @return a Bloom filter wrapping the provided memory + */ static bloom_filter_alloc writable_wrap(void* data, size_t length_bytes, const Allocator& allocator = Allocator()); + /** + * Copy constructor + * @param other filter to be copied + */ + bloom_filter_alloc(const bloom_filter_alloc&); + + /** Move constructor + * @param other filter to be moved + */ + bloom_filter_alloc(bloom_filter_alloc&&) noexcept; + + /** + * Copy assignment + * @param other filter to be copied + * @return reference to this filter + */ + bloom_filter_alloc& operator=(const bloom_filter_alloc& other); + + /** + * Move assignment + * @param other filter to be moved + * @return reference to this filter + */ + bloom_filter_alloc& operator=(bloom_filter_alloc&& other); + + /** + * @brief Destroy the bloom filter alloc object + */ + ~bloom_filter_alloc(); + + // This is a convenience alias for users + // The type returned by the following serialize method + using vector_bytes = std::vector::template rebind_alloc>; + + /** + * This method serializes the filter as a vector of bytes. + * An optional header can be reserved in front of the filter. + * It is a blank space of a given size. + * This header is used in Datasketches PostgreSQL extension. 
+ * @param header_size_bytes space to reserve in front of the filter + * @return serialized filter as a vector of bytes + */ + vector_bytes serialize(unsigned header_size_bytes = 0) const; + + /** + * This method serializes the filter into a given stream in a binary form + * @param os output stream + */ + void serialize(std::ostream& os) const; + /** * Checks if the Bloom Filter has processed any items * @return True if the BloomFilter is empty, otherwise False @@ -510,11 +657,6 @@ class bloom_filter_alloc { */ string to_string(bool print_filter = false) const; - /** - * @brief Destroy the bloom filter alloc object - */ - ~bloom_filter_alloc(); - private: static const uint64_t DIRTY_BITS_VALUE = static_cast(-1LL); static const uint64_t MAX_HEADER_SIZE_BYTES = 32; // 4 Java Longs diff --git a/filters/include/bloom_filter_builder_impl.hpp b/filters/include/bloom_filter_builder_impl.hpp index 6c16bb29..5b34767a 100644 --- a/filters/include/bloom_filter_builder_impl.hpp +++ b/filters/include/bloom_filter_builder_impl.hpp @@ -40,31 +40,39 @@ uint64_t bloom_filter_builder_alloc::generate_random_seed() { } template -uint16_t bloom_filter_builder_alloc::suggest_num_hashes(const uint64_t num_distinct_items, +uint16_t bloom_filter_builder_alloc::suggest_num_hashes(const uint64_t max_distinct_items, const uint64_t num_filter_bits) { - // TODO: validate inputs > 0 - return static_cast(std::ceil(static_cast(num_filter_bits) / num_distinct_items * log(2.0))); + if (max_distinct_items == 0) { + throw std::invalid_argument("maximum number of distinct items must be strictly positive"); + } + if (num_filter_bits == 0) { + throw std::invalid_argument("number of bits in the filter must be strictly positive"); + } else if (num_filter_bits > bloom_filter_alloc::MAX_FILTER_SIZE_BITS) { + throw std::invalid_argument("number of bits in the filter must be less than 2^63"); + } + return static_cast(std::ceil(static_cast(num_filter_bits) / max_distinct_items * log(2.0))); } template uint16_t 
bloom_filter_builder_alloc::suggest_num_hashes(const double target_false_positive_prob) { - // TODO: validate inputs + validate_accuracy_inputs(100, target_false_positive_prob); // max_distinct_items is an arbitrary valid value return static_cast(std::ceil(-log(target_false_positive_prob) / log(2.0))); } template uint64_t bloom_filter_builder_alloc::suggest_num_filter_bits(const uint64_t max_distinct_items, const double target_false_positive_prob) { - // TODO: validate inputs + validate_accuracy_inputs(max_distinct_items, target_false_positive_prob); return static_cast(std::ceil(-static_cast(max_distinct_items) * log(target_false_positive_prob) / (log(2.0) * log(2.0)))); } template -bloom_filter_alloc bloom_filter_builder_alloc::create_by_accuracy(const uint64_t num_distinct_items, +bloom_filter_alloc bloom_filter_builder_alloc::create_by_accuracy(const uint64_t max_distinct_items, const double target_false_positive_prob, const uint64_t seed, const A& allocator) { - const uint64_t num_filter_bits = bloom_filter_builder_alloc::suggest_num_filter_bits(num_distinct_items, target_false_positive_prob); + validate_accuracy_inputs(max_distinct_items, target_false_positive_prob); + const uint64_t num_filter_bits = bloom_filter_builder_alloc::suggest_num_filter_bits(max_distinct_items, target_false_positive_prob); const uint16_t num_hashes = bloom_filter_builder_alloc::suggest_num_hashes(target_false_positive_prob); return bloom_filter_alloc(num_filter_bits, num_hashes, seed, allocator); } @@ -74,19 +82,19 @@ bloom_filter_alloc bloom_filter_builder_alloc::create_by_size(const uint64 const uint16_t num_hashes, const uint64_t seed, const A& allocator) { - // TODO: validate inputs + validate_size_inputs(num_bits, num_hashes); return bloom_filter_alloc(num_bits, num_hashes, seed, allocator); } template bloom_filter_alloc bloom_filter_builder_alloc::initialize_by_accuracy(void* memory, const size_t length_bytes, - const uint64_t num_distinct_items, + const uint64_t 
max_distinct_items, const double target_false_positive_prob, const uint64_t seed, const A& allocator) { - // TODO: validate inputs - const uint64_t num_filter_bits = bloom_filter_builder_alloc::suggest_num_filter_bits(num_distinct_items, target_false_positive_prob); + validate_accuracy_inputs(max_distinct_items, target_false_positive_prob); + const uint64_t num_filter_bits = bloom_filter_builder_alloc::suggest_num_filter_bits(max_distinct_items, target_false_positive_prob); const uint16_t num_hashes = bloom_filter_builder_alloc::suggest_num_hashes(target_false_positive_prob); return bloom_filter_alloc(static_cast(memory), length_bytes, num_filter_bits, num_hashes, seed, allocator); } @@ -98,10 +106,32 @@ bloom_filter_alloc bloom_filter_builder_alloc::initialize_by_size(void* me const uint16_t num_hashes, const uint64_t seed, const A& allocator) { - // TODO: validate inputs + validate_size_inputs(num_bits, num_hashes); return bloom_filter_alloc(static_cast(memory), length_bytes, num_bits, num_hashes, seed, allocator); } +template +void bloom_filter_builder_alloc::validate_size_inputs(uint64_t num_bits, uint16_t num_hashes) { + if (num_bits == 0) { + throw std::invalid_argument("number of bits in the filter must be strictly positive"); + } else if (num_bits > bloom_filter_alloc::MAX_FILTER_SIZE_BITS) { + throw std::invalid_argument("number of bits in the filter must be less than 2^63"); + } + if (num_hashes == 0) { + throw std::invalid_argument("number of hashes for the filter must be strictly positive"); + } +} + +template +void bloom_filter_builder_alloc::validate_accuracy_inputs(uint64_t max_distinct_items, double target_false_positive_prob) { + if (max_distinct_items == 0) { + throw std::invalid_argument("maximum number of distinct items must be strictly positive"); + } + if (target_false_positive_prob <= 0.0 || target_false_positive_prob > 1.0) { + throw std::invalid_argument("target false positive probability must be a valid probability strictly greater than 
0.0"); + } +} + } // namespace datasketches #endif // _BLOOM_FILTER_BUILDER_IMPL_HPP_ \ No newline at end of file diff --git a/filters/include/bloom_filter_impl.hpp b/filters/include/bloom_filter_impl.hpp index 7baee8f0..9f635699 100644 --- a/filters/include/bloom_filter_impl.hpp +++ b/filters/include/bloom_filter_impl.hpp @@ -147,6 +147,82 @@ bloom_filter_alloc::bloom_filter_alloc(const uint64_t seed, } } +template +bloom_filter_alloc::bloom_filter_alloc(const bloom_filter_alloc& other) : + allocator_(other.allocator_), + seed_(other.seed_), + num_hashes_(other.num_hashes_), + is_dirty_(other.is_dirty_), + is_owned_(other.is_owned_), + is_read_only_(other.is_read_only_), + capacity_bits_(other.capacity_bits_), + num_bits_set_(other.num_bits_set_) +{ + if (is_owned_) { + const size_t num_bytes = capacity_bits_ >> 3; + bit_array_ = allocator_.allocate(num_bytes); + if (bit_array_ == nullptr) { + throw std::bad_alloc(); + } + std::copy_n(other.bit_array_, num_bytes, bit_array_); + memory_ = nullptr; + } else { + bit_array_ = other.bit_array_; + memory_ = other.memory_; + } +} + +template +bloom_filter_alloc::bloom_filter_alloc(bloom_filter_alloc&& other) noexcept : + allocator_(std::move(other.allocator_)), + seed_(other.seed_), + num_hashes_(other.num_hashes_), + is_dirty_(other.is_dirty_), + is_owned_(other.is_owned_), + is_read_only_(other.is_read_only_), + capacity_bits_(other.capacity_bits_), + num_bits_set_(other.num_bits_set_), + bit_array_(std::move(other.bit_array_)), + memory_(std::move(other.memory_)) +{ + // ensure destructor on other will behave nicely + other.is_owned_ = false; + other.bit_array_ = nullptr; + other.memory_ = nullptr; +} + +template +bloom_filter_alloc& bloom_filter_alloc::operator=(const bloom_filter_alloc& other) { + bloom_filter_alloc copy(other); + std::swap(allocator_, copy.allocator_); + std::swap(seed_, copy.seed_); + std::swap(num_hashes_, copy.num_hashes_); + std::swap(is_dirty_, copy.is_dirty_); + std::swap(is_owned_, 
copy.is_owned_); + std::swap(is_read_only_, copy.is_read_only_); + std::swap(capacity_bits_, copy.capacity_bits_); + std::swap(num_bits_set_, copy.num_bits_set_); + std::swap(bit_array_, copy.bit_array_); + std::swap(memory_, copy.memory_); + return *this; +} + +template +bloom_filter_alloc& bloom_filter_alloc::operator=(bloom_filter_alloc&& other) { + if (this == &other) { return *this; } + std::swap(allocator_, other.allocator_); + std::swap(seed_, other.seed_); + std::swap(num_hashes_, other.num_hashes_); + std::swap(is_dirty_, other.is_dirty_); + std::swap(is_owned_, other.is_owned_); + std::swap(is_read_only_, other.is_read_only_); + std::swap(capacity_bits_, other.capacity_bits_); + std::swap(num_bits_set_, other.num_bits_set_); + std::swap(bit_array_, other.bit_array_); + std::swap(memory_, other.memory_); + return *this; +} + template bloom_filter_alloc::~bloom_filter_alloc() { if (is_owned_) { @@ -162,9 +238,6 @@ bloom_filter_alloc::~bloom_filter_alloc() { } } -// TODO: copy, move constructors -// TODO: copy, move assignment operators - template bloom_filter_alloc bloom_filter_alloc::deserialize(const void* bytes, size_t length_bytes, const A& allocator) { // not wrapping so we can cast away const as we're not modifying the memory diff --git a/filters/test/bloom_filter_test.cpp b/filters/test/bloom_filter_test.cpp index 16b2019d..df22cadf 100644 --- a/filters/test/bloom_filter_test.cpp +++ b/filters/test/bloom_filter_test.cpp @@ -368,7 +368,8 @@ TEST_CASE("bloom_filter: non-empty serialization", "[bloom_filter]") { ++fp_count_wrap; } REQUIRE(fp_count_wrap == fp_count); - REQUIRE_THROWS_AS(bf_wrap.update(0.5), std::logic_error); + REQUIRE_THROWS_AS(bf_wrap.update(-1.0), std::logic_error); + REQUIRE_THROWS_AS(bf_wrap.query_and_update(-2.0), std::logic_error); REQUIRE_THROWS_AS(bf_wrap.reset(), std::logic_error); // writable wrap From 2c70bbed1444a3dea27f60e69f8daf2420be9501 Mon Sep 17 00:00:00 2001 From: Jon Malkin <786705+jmalkin@users.noreply.github.com> 
Date: Tue, 13 Aug 2024 12:33:28 -0700 Subject: [PATCH 17/30] fix allocator usage, add test_allocator test --- filters/include/bloom_filter.hpp | 2 ++ filters/include/bloom_filter_impl.hpp | 12 ++++++------ filters/test/CMakeLists.txt | 1 + 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/filters/include/bloom_filter.hpp b/filters/include/bloom_filter.hpp index adfc6231..e73170d0 100644 --- a/filters/include/bloom_filter.hpp +++ b/filters/include/bloom_filter.hpp @@ -658,6 +658,8 @@ class bloom_filter_alloc { string to_string(bool print_filter = false) const; private: + using AllocUint8 = typename std::allocator_traits::template rebind_alloc; + static const uint64_t DIRTY_BITS_VALUE = static_cast(-1LL); static const uint64_t MAX_HEADER_SIZE_BYTES = 32; // 4 Java Longs static const uint64_t BIT_ARRAY_LENGTH_OFFSET_BYTES = 16; diff --git a/filters/include/bloom_filter_impl.hpp b/filters/include/bloom_filter_impl.hpp index 9f635699..fab4be7b 100644 --- a/filters/include/bloom_filter_impl.hpp +++ b/filters/include/bloom_filter_impl.hpp @@ -57,7 +57,7 @@ bloom_filter_alloc::bloom_filter_alloc(const uint64_t num_bits, const uint16_ } const uint64_t num_bytes = capacity_bits_ >> 3; - bit_array_ = allocator_.allocate(num_bytes); + bit_array_ = AllocUint8(allocator_).allocate(num_bytes); std::fill_n(bit_array_, num_bytes, 0); if (bit_array_ == nullptr) { throw std::bad_alloc(); @@ -160,7 +160,7 @@ bloom_filter_alloc::bloom_filter_alloc(const bloom_filter_alloc& other) : { if (is_owned_) { const size_t num_bytes = capacity_bits_ >> 3; - bit_array_ = allocator_.allocate(num_bytes); + bit_array_ = AllocUint8(allocator_).allocate(num_bytes); if (bit_array_ == nullptr) { throw std::bad_alloc(); } @@ -228,10 +228,10 @@ bloom_filter_alloc::~bloom_filter_alloc() { if (is_owned_) { if (memory_ != nullptr) { // deallocate total memory_ block, including preamble - allocator_.deallocate(memory_, (capacity_bits_ >> 3) + BIT_ARRAY_OFFSET_BYTES); + 
AllocUint8(allocator_).deallocate(memory_, (capacity_bits_ >> 3) + BIT_ARRAY_OFFSET_BYTES); } else if (bit_array_ != nullptr) { // only need to deallocate bit_array_ - allocator_.deallocate(bit_array_, capacity_bits_ >> 3); + AllocUint8(allocator_).deallocate(bit_array_, capacity_bits_ >> 3); } memory_ = nullptr; bit_array_ = nullptr; @@ -279,7 +279,7 @@ bloom_filter_alloc bloom_filter_alloc::deserialize(std::istream& is, const // allocate memory const uint64_t num_bytes = num_longs << 3; - A alloc(allocator); + AllocUint8 alloc(allocator); uint8_t* bit_array = alloc.allocate(num_bytes); if (bit_array == nullptr) { throw std::bad_alloc(); @@ -364,7 +364,7 @@ bloom_filter_alloc bloom_filter_alloc::internal_deserialize_or_wrap(void* memory = nullptr; const uint64_t num_bytes = num_longs << 3; ensure_minimum_memory(end_ptr - ptr, num_bytes); - A alloc(allocator); + AllocUint8 alloc(allocator); bit_array = alloc.allocate(num_bytes); if (bit_array == nullptr) { throw std::bad_alloc(); diff --git a/filters/test/CMakeLists.txt b/filters/test/CMakeLists.txt index 67615d0f..67e24dda 100644 --- a/filters/test/CMakeLists.txt +++ b/filters/test/CMakeLists.txt @@ -42,4 +42,5 @@ target_sources(bloom_filter_test PRIVATE bit_array_ops_test.cpp bloom_filter_test.cpp + bloom_filter_allocation_test.cpp ) From 66e64f22a2416a860d6f57f36b00631b80e8f38b Mon Sep 17 00:00:00 2001 From: Jon Malkin <786705+jmalkin@users.noreply.github.com> Date: Tue, 13 Aug 2024 12:35:17 -0700 Subject: [PATCH 18/30] actually add test_allocator test --- filters/test/bloom_filter_allocation_test.cpp | 76 +++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 filters/test/bloom_filter_allocation_test.cpp diff --git a/filters/test/bloom_filter_allocation_test.cpp b/filters/test/bloom_filter_allocation_test.cpp new file mode 100644 index 00000000..c95f56f8 --- /dev/null +++ b/filters/test/bloom_filter_allocation_test.cpp @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) 
under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include + +#include + +#include "bloom_filter.hpp" +#include "test_type.hpp" +#include "test_allocator.hpp" + +namespace datasketches { + +using bloom_filter_builder_test_alloc = bloom_filter_builder_alloc>; +using bloom_filter_test_alloc = bloom_filter_alloc>; +using alloc = test_allocator; + +TEST_CASE("bloom filter allocation test", "[bloom_filter][test_type]") { + test_allocator_total_bytes = 0; + test_allocator_net_allocations = 0; + { + int64_t num_items = 10000; + double fpp = 0.01; + uint64_t seed = bloom_filter_builder_test_alloc::generate_random_seed(); + auto bf1 = bloom_filter_builder_test_alloc::create_by_accuracy(num_items, + fpp, + seed, + alloc(0)); + for (int i = 0; i < num_items; ++i) { + if (num_items % 1 == 0) { + bf1.update(std::to_string(i)); + } else { + bf1.update(i); + } + } + auto bytes1 = bf1.serialize(0); + auto bf2 = bloom_filter_test_alloc::deserialize(bytes1.data(), bytes1.size(), 0); + + std::stringstream ss; + bf1.serialize(ss); + auto bf3 = bloom_filter_test_alloc::deserialize(ss, alloc(0)); + + bf3.reset(); + for (int i = 0; i < num_items; ++i) { + bf1.update(-1.0 * i); + } + + bf3.union_with(bf1); + + auto bytes2 = bf3.serialize(0); + auto bf4 = 
bloom_filter_test_alloc::deserialize(bytes2.data(), bytes2.size(), 0); + + auto bf5 = bloom_filter_test_alloc::wrap(bytes2.data(), bytes2.size(), 0); + auto bf6 = bloom_filter_test_alloc::writable_wrap(bytes2.data(), bytes2.size(), 0); + } + REQUIRE(test_allocator_total_bytes == 0); + REQUIRE(test_allocator_net_allocations == 0); +} + +} From 321f01de2e036a65830b7096dc5e205b180d310f Mon Sep 17 00:00:00 2001 From: Jon Malkin <786705+jmalkin@users.noreply.github.com> Date: Tue, 13 Aug 2024 16:46:37 -0700 Subject: [PATCH 19/30] Add java serde compatibility tests --- filters/test/CMakeLists.txt | 14 +++++ ...loom_filter_deserialize_from_java_test.cpp | 51 +++++++++++++++++++ .../test/bloom_filter_serialize_for_java.cpp | 44 ++++++++++++++++ 3 files changed, 109 insertions(+) create mode 100644 filters/test/bloom_filter_deserialize_from_java_test.cpp create mode 100644 filters/test/bloom_filter_serialize_for_java.cpp diff --git a/filters/test/CMakeLists.txt b/filters/test/CMakeLists.txt index 67e24dda..126fa922 100644 --- a/filters/test/CMakeLists.txt +++ b/filters/test/CMakeLists.txt @@ -44,3 +44,17 @@ target_sources(bloom_filter_test bloom_filter_test.cpp bloom_filter_allocation_test.cpp ) + +if (SERDE_COMPAT) +target_sources(bloom_filter_test + PRIVATE + bloom_filter_deserialize_from_java_test.cpp +) +endif() + +if (GENERATE) +target_sources(bloom_filter_test + PRIVATE + bloom_filter_serialize_for_java.cpp +) +endif() diff --git a/filters/test/bloom_filter_deserialize_from_java_test.cpp b/filters/test/bloom_filter_deserialize_from_java_test.cpp new file mode 100644 index 00000000..81017e59 --- /dev/null +++ b/filters/test/bloom_filter_deserialize_from_java_test.cpp @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include + +#include "bloom_filter.hpp" + +namespace datasketches { + +// assume the binary sketches for this test have been generated by datasketches-java code +// in the subdirectory called "java" in the root directory of this project +static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/"; + +TEST_CASE("bloom_filter", "[serde_compat]") { + const uint64_t n_arr[] = {0, 10000, 2000000, 30000000}; + const double h_arr[] = {3, 5}; + for (const uint64_t n: n_arr) { + for (const uint16_t num_hashes: h_arr) { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(testBinaryInputPath + "bf_n" + std::to_string(n) + "_h" + std::to_string(num_hashes) + "_cpp.filter", std::ios::binary); + is.open(path, std::ios::binary); + auto bf = bloom_filter::deserialize(is); + REQUIRE(bf.is_empty() == (n == 0)); + REQUIRE((bf.is_empty() || (bf.get_bits_used() > n / 10))); + + for (uint64_t i = 0; i < n / 10; ++i) { + REQUIRE(bf.query(i)); + } + } + } +} + +} /* namespace datasketches */ diff --git a/filters/test/bloom_filter_serialize_for_java.cpp b/filters/test/bloom_filter_serialize_for_java.cpp new file mode 100644 index 00000000..00293886 --- /dev/null +++ b/filters/test/bloom_filter_serialize_for_java.cpp @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor 
license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include + +#include "bloom_filter.hpp" + +namespace datasketches { + +TEST_CASE("bloom filter generate", "[serialize_for_java]") { + const uint64_t n_arr[] = {0, 10000, 2000000, 30000000}; + const double h_arr[] = {3, 5}; + for (const uint64_t n: n_arr) { + for (const uint16_t num_hashes: h_arr) { + const uint64_t config_bits = std::max(n, static_cast(1000)); // so empty still has valid bit size + bloom_filter bf = bloom_filter_builder::create_by_size(config_bits, num_hashes); + for (uint64_t i = 0; i < n / 10; ++i) bf.update(i); // note: n / 10 items into n bits + REQUIRE(bf.is_empty() == (n == 0)); + REQUIRE((bf.is_empty() || (bf.get_bits_used() > n / 10))); + std::ofstream os("bf_n" + std::to_string(n) + "_h" + std::to_string(num_hashes) + "_cpp.filter", std::ios::binary); + bf.serialize(os); + } + } +} + +} /* namespace datasketches */ From 6daa489650ff0fc876ed161856a791f7c06b42a2 Mon Sep 17 00:00:00 2001 From: Jon Malkin <786705+jmalkin@users.noreply.github.com> Date: Tue, 13 Aug 2024 16:52:20 -0700 Subject: [PATCH 20/30] remove debug line that was no longer valid --- filters/test/bloom_filter_deserialize_from_java_test.cpp | 1 - 1 file changed, 1 deletion(-) diff --git 
a/filters/test/bloom_filter_deserialize_from_java_test.cpp b/filters/test/bloom_filter_deserialize_from_java_test.cpp index 81017e59..c8e11f83 100644 --- a/filters/test/bloom_filter_deserialize_from_java_test.cpp +++ b/filters/test/bloom_filter_deserialize_from_java_test.cpp @@ -36,7 +36,6 @@ TEST_CASE("bloom_filter", "[serde_compat]") { std::ifstream is; is.exceptions(std::ios::failbit | std::ios::badbit); is.open(testBinaryInputPath + "bf_n" + std::to_string(n) + "_h" + std::to_string(num_hashes) + "_cpp.filter", std::ios::binary); - is.open(path, std::ios::binary); auto bf = bloom_filter::deserialize(is); REQUIRE(bf.is_empty() == (n == 0)); REQUIRE((bf.is_empty() || (bf.get_bits_used() > n / 10))); From 505a11946d2b18d5070ea58a37afa0b693bd9d43 Mon Sep 17 00:00:00 2001 From: Jon Malkin <786705+jmalkin@users.noreply.github.com> Date: Wed, 14 Aug 2024 00:02:25 -0700 Subject: [PATCH 21/30] Include NaN in language compatibility tests --- filters/test/bloom_filter_deserialize_from_java_test.cpp | 1 + filters/test/bloom_filter_serialize_for_java.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/filters/test/bloom_filter_deserialize_from_java_test.cpp b/filters/test/bloom_filter_deserialize_from_java_test.cpp index c8e11f83..255d1f78 100644 --- a/filters/test/bloom_filter_deserialize_from_java_test.cpp +++ b/filters/test/bloom_filter_deserialize_from_java_test.cpp @@ -43,6 +43,7 @@ TEST_CASE("bloom_filter", "[serde_compat]") { for (uint64_t i = 0; i < n / 10; ++i) { REQUIRE(bf.query(i)); } + if (n > 0) REQUIRE(bf.query(std::nan("1"))); } } } diff --git a/filters/test/bloom_filter_serialize_for_java.cpp b/filters/test/bloom_filter_serialize_for_java.cpp index 00293886..082e5b2a 100644 --- a/filters/test/bloom_filter_serialize_for_java.cpp +++ b/filters/test/bloom_filter_serialize_for_java.cpp @@ -33,6 +33,7 @@ TEST_CASE("bloom filter generate", "[serialize_for_java]") { const uint64_t config_bits = std::max(n, static_cast(1000)); // so empty still has valid bit 
size bloom_filter bf = bloom_filter_builder::create_by_size(config_bits, num_hashes); for (uint64_t i = 0; i < n / 10; ++i) bf.update(i); // note: n / 10 items into n bits + if (n > 0) bf.update(std::nan("1")); // include a NaN if non-empty REQUIRE(bf.is_empty() == (n == 0)); REQUIRE((bf.is_empty() || (bf.get_bits_used() > n / 10))); std::ofstream os("bf_n" + std::to_string(n) + "_h" + std::to_string(num_hashes) + "_cpp.filter", std::ios::binary); From 533b6b9520fbd0b926dd867840822302faed65b7 Mon Sep 17 00:00:00 2001 From: Jon Malkin <786705+jmalkin@users.noreply.github.com> Date: Wed, 14 Aug 2024 00:12:07 -0700 Subject: [PATCH 22/30] use .sk suffix on cross-lang filter binaries to avoid workflow changes --- filters/test/bloom_filter_deserialize_from_java_test.cpp | 2 +- filters/test/bloom_filter_serialize_for_java.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/filters/test/bloom_filter_deserialize_from_java_test.cpp b/filters/test/bloom_filter_deserialize_from_java_test.cpp index 255d1f78..f2dee18c 100644 --- a/filters/test/bloom_filter_deserialize_from_java_test.cpp +++ b/filters/test/bloom_filter_deserialize_from_java_test.cpp @@ -35,7 +35,7 @@ TEST_CASE("bloom_filter", "[serde_compat]") { for (const uint16_t num_hashes: h_arr) { std::ifstream is; is.exceptions(std::ios::failbit | std::ios::badbit); - is.open(testBinaryInputPath + "bf_n" + std::to_string(n) + "_h" + std::to_string(num_hashes) + "_cpp.filter", std::ios::binary); + is.open(testBinaryInputPath + "bf_n" + std::to_string(n) + "_h" + std::to_string(num_hashes) + "_cpp.sk", std::ios::binary); auto bf = bloom_filter::deserialize(is); REQUIRE(bf.is_empty() == (n == 0)); REQUIRE((bf.is_empty() || (bf.get_bits_used() > n / 10))); diff --git a/filters/test/bloom_filter_serialize_for_java.cpp b/filters/test/bloom_filter_serialize_for_java.cpp index 082e5b2a..be4e62b1 100644 --- a/filters/test/bloom_filter_serialize_for_java.cpp +++ b/filters/test/bloom_filter_serialize_for_java.cpp 
@@ -36,7 +36,7 @@ TEST_CASE("bloom filter generate", "[serialize_for_java]") { if (n > 0) bf.update(std::nan("1")); // include a NaN if non-empty REQUIRE(bf.is_empty() == (n == 0)); REQUIRE((bf.is_empty() || (bf.get_bits_used() > n / 10))); - std::ofstream os("bf_n" + std::to_string(n) + "_h" + std::to_string(num_hashes) + "_cpp.filter", std::ios::binary); + std::ofstream os("bf_n" + std::to_string(n) + "_h" + std::to_string(num_hashes) + "_cpp.sk", std::ios::binary); bf.serialize(os); } } From ecc856bfbb2b93a5af27add912a2bddb7673c86d Mon Sep 17 00:00:00 2001 From: Jon Malkin <786705+jmalkin@users.noreply.github.com> Date: Wed, 14 Aug 2024 12:58:38 -0700 Subject: [PATCH 23/30] Add class-level docs to bloom filter (and builder) and include the serialization format in the impl file --- filters/include/bloom_filter.hpp | 33 +++++++++++++++++++++++++++ filters/include/bloom_filter_impl.hpp | 23 +++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/filters/include/bloom_filter.hpp b/filters/include/bloom_filter.hpp index e73170d0..71b170e0 100644 --- a/filters/include/bloom_filter.hpp +++ b/filters/include/bloom_filter.hpp @@ -37,6 +37,14 @@ template class bloom_filter_builder_alloc; using bloom_filter = bloom_filter_alloc>; using bloom_filter_builder = bloom_filter_builder_alloc>; +/** + *

This class provides methods to help estimate the correct parameters when + * creating a Bloom filter, and methods to create the filter using those values.

+ * + *

The underlying math is described in the + * + * Wikipedia article on Bloom filters.

+ */ template> class bloom_filter_builder_alloc { using A = Allocator; @@ -149,6 +157,31 @@ class bloom_filter_builder_alloc { static void validate_accuracy_inputs(uint64_t max_distinct_items, double target_false_positive_prob); }; +/** + *

A Bloom filter is a data structure that can be used for probabilistic + * set membership.

+ * + *

When querying a Bloom filter, there are no false negatives. Specifically: + * When querying an item that has already been inserted to the filter, the filter will + * always indicate that the item is present. There is a chance of false positives, where + * querying an item that has never been presented to the filter will indicate that the + * item has already been seen. Consequently, any query should be interpreted as + * "might have seen."

+ * + *

A standard Bloom filter is unlike typical sketches in that it is not sub-linear + * in size and does not resize itself. A Bloom filter will work up to a target number of + * distinct items, beyond which it will saturate and the false positive rate will start to + * increase. The size of a Bloom filter will be linear in the expected number of + * distinct items.

+ * + *

See the bloom_filter_builder_alloc class for methods to create a filter, especially + * one sized correctly for a target number of distinct elements and a target + * false positive probability.

+ * + *

This implementation uses xxHash64 and follows the approach in Kirsch and Mitzenmacher, + * "Less Hashing, Same Performance: Building a Better Bloom Filter," Wiley Interscience, 2008, pp. 187-218.

+ */ + template> class bloom_filter_alloc { using A = Allocator; diff --git a/filters/include/bloom_filter_impl.hpp b/filters/include/bloom_filter_impl.hpp index fab4be7b..b8c47a9b 100644 --- a/filters/include/bloom_filter_impl.hpp +++ b/filters/include/bloom_filter_impl.hpp @@ -244,6 +244,29 @@ bloom_filter_alloc bloom_filter_alloc::deserialize(const void* bytes, size return internal_deserialize_or_wrap(const_cast(bytes), length_bytes, false, false, allocator); } +/* + * A Bloom Filter's serialized image always uses 3 longs of preamble when empty, + * otherwise 4 longs: + * + *
+ * Long || Start Byte Adr:
+ * Adr:
+ *      ||       0        |    1   |    2   |    3   |    4   |    5   |    6   |    7   |
+ *  0   || Preamble_Longs | SerVer | FamID  |  Flags |----Num Hashes---|-----Unused------|
+ *
+ *      ||       8        |    9   |   10   |   11   |   12   |   13   |   14   |   15   |
+ *  1   ||---------------------------------Hash Seed-------------------------------------|
+ *
+ *      ||      16        |   17   |   18   |   19   |   20   |   21   |   22   |   23   |
+ *  2   ||-------BitArray Length (in longs)----------|-----------Unused------------------|
+ *
+ *      ||      24        |   25   |   26   |   27   |   28   |   29   |   30   |   31   |
+ *  3   ||---------------------------------NumBitsSet------------------------------------|
+ *  
+ * + * The raw BitArray bits, if non-empty, start at byte 32. + */ + template bloom_filter_alloc
bloom_filter_alloc::deserialize(std::istream& is, const A& allocator) { const uint8_t prelongs = read(is); From 5e62bc3b0634dcbfc3c17fa46aa223c7da0bbad1 Mon Sep 17 00:00:00 2001 From: Jon Date: Wed, 14 Aug 2024 23:53:24 -0700 Subject: [PATCH 24/30] Address most review feedback --- common/include/common_defs.hpp | 1 + filters/include/bit_array_ops.hpp | 6 ++--- filters/include/bloom_filter.hpp | 24 +++++++++---------- filters/include/bloom_filter_builder_impl.hpp | 7 +----- filters/include/bloom_filter_impl.hpp | 8 +++---- ...loom_filter_deserialize_from_java_test.cpp | 2 +- filters/test/bloom_filter_test.cpp | 2 +- 7 files changed, 23 insertions(+), 27 deletions(-) diff --git a/common/include/common_defs.hpp b/common/include/common_defs.hpp index 8a61ff30..6a87e079 100644 --- a/common/include/common_defs.hpp +++ b/common/include/common_defs.hpp @@ -42,6 +42,7 @@ namespace random_utils { static std::random_device rd; // possibly unsafe in MinGW with GCC < 9.2 static thread_local std::mt19937_64 rand(rd()); static thread_local std::uniform_real_distribution<> next_double(0.0, 1.0); + static thread_local std::uniform_int_distribution next_uint64(0, UINT64_MAX); // thread-safe random bit static thread_local std::independent_bits_engine diff --git a/filters/include/bit_array_ops.hpp b/filters/include/bit_array_ops.hpp index cc1a42fa..2ccfe00f 100644 --- a/filters/include/bit_array_ops.hpp +++ b/filters/include/bit_array_ops.hpp @@ -69,7 +69,7 @@ namespace bit_array_ops { * @param array the array of bits * @param index the index of the bit to set. 
*/ - static inline void assign_bit(uint8_t* array, const uint64_t index, const bool value) { + static inline void assign_bit(uint8_t* array, uint64_t index, bool value) { // read-only checks handled by set_bit() and clear_bit() if (value) { set_bit(array, index); @@ -79,12 +79,12 @@ namespace bit_array_ops { } /** - * Gets teh value of a bit at the specified index and sets it to true + * Gets the value of a bit at the specified index and sets it to true * @param array the array of bits * @param index the index of the bit to get and set * @return the value of the bit at the specified index */ - static inline bool get_and_set_bit(uint8_t* array, const uint64_t index) { + static inline bool get_and_set_bit(uint8_t* array, uint64_t index) { const uint64_t offset = index >> 3; const uint8_t mask = 1 << (index & 7); if ((array[offset] & mask) != 0) { diff --git a/filters/include/bloom_filter.hpp b/filters/include/bloom_filter.hpp index 71b170e0..cf2059c9 100644 --- a/filters/include/bloom_filter.hpp +++ b/filters/include/bloom_filter.hpp @@ -203,7 +203,7 @@ class bloom_filter_alloc { * @param allocator instance of an Allocator * @return an instance of a Bloom filter */ - static bloom_filter_alloc deserialize(std::istream& is, const A& allocator = Allocator()); + static bloom_filter_alloc deserialize(std::istream& is, const Allocator& allocator = Allocator()); /** * @brief Wraps the provided memory as a read-only Bloom filter. 
Reads the data in-place and does @@ -231,12 +231,12 @@ class bloom_filter_alloc { * Copy constructor * @param other filter to be copied */ - bloom_filter_alloc(const bloom_filter_alloc&); + bloom_filter_alloc(const bloom_filter_alloc& other); /** Move constructor * @param other filter to be moved */ - bloom_filter_alloc(bloom_filter_alloc&&) noexcept; + bloom_filter_alloc(bloom_filter_alloc&& other) noexcept; /** * Copy assignment @@ -253,7 +253,7 @@ class bloom_filter_alloc { bloom_filter_alloc& operator=(bloom_filter_alloc&& other); /** - * @brief Destroy the bloom filter alloc object + * @brief Destroy the bloom filter object */ ~bloom_filter_alloc(); @@ -265,7 +265,7 @@ class bloom_filter_alloc { * This method serializes the filter as a vector of bytes. * An optional header can be reserved in front of the filter. * It is a blank space of a given size. - * This header is used in Datasketches PostgreSQL extension. + * Some integrations such as PostgreSQL may need this header space. * @param header_size_bytes space to reserve in front of the filter * @return serialized filter as a vector of bytes */ @@ -658,17 +658,17 @@ class bloom_filter_alloc { bool is_memory_owned() const; /** - * @brief Checks if the Bloom Filter has backing memory. + * @brief Checks if the Bloom Filter was created by a call to wrap(). * - * @return True if the filter has backing memory, otherwise false. + * @return True if the filter was created by wrapping memory, otherwise false. */ - bool has_backing_memory() const; + bool is_wrapped() const; /** - * @brief Returns a pointer to the backing memory, if it exists. - * @return A pointer to the backing memory, or nullptr if it does not exist. + * @brief Returns a pointer to the memory this filter wraps, if it exists. + * @return A pointer to the wrapped memory, or nullptr if is_wrapped() is false. 
*/ - const uint8_t* get_backing_memory() const; + const uint8_t* get_wrapped_memory() const; /** * @brief Gets the serialized size of the Bloom Filter in bytes @@ -739,7 +739,7 @@ class bloom_filter_alloc { uint64_t seed_; uint16_t num_hashes_; bool is_dirty_; - bool is_owned_; // if true, data is not owned by filter AND data_ holdes the entire filter not just the bit array + bool is_owned_; // if true, data is not owned by filter AND memory_ holds the entire filter not just the bit array bool is_read_only_; // if true, filter is read-only uint64_t capacity_bits_; uint64_t num_bits_set_; diff --git a/filters/include/bloom_filter_builder_impl.hpp b/filters/include/bloom_filter_builder_impl.hpp index 5b34767a..b2050730 100644 --- a/filters/include/bloom_filter_builder_impl.hpp +++ b/filters/include/bloom_filter_builder_impl.hpp @@ -31,12 +31,7 @@ namespace datasketches { template uint64_t bloom_filter_builder_alloc::generate_random_seed() { - union { - uint64_t long_value; - double double_value; - } ldu; - ldu.double_value = random_utils::next_double(random_utils::rand); - return ldu.long_value; + return random_utils::next_uint64(random_utils::rand); } template diff --git a/filters/include/bloom_filter_impl.hpp b/filters/include/bloom_filter_impl.hpp index b8c47a9b..0195eb5d 100644 --- a/filters/include/bloom_filter_impl.hpp +++ b/filters/include/bloom_filter_impl.hpp @@ -182,8 +182,8 @@ bloom_filter_alloc::bloom_filter_alloc(bloom_filter_alloc&& other) noexcept : is_read_only_(other.is_read_only_), capacity_bits_(other.capacity_bits_), num_bits_set_(other.num_bits_set_), - bit_array_(std::move(other.bit_array_)), - memory_(std::move(other.memory_)) + bit_array_(other.bit_array_), + memory_(other.memory_) { // ensure destructor on other will behave nicely other.is_owned_ = false; @@ -504,7 +504,7 @@ bool bloom_filter_alloc::is_read_only() const { } template -bool bloom_filter_alloc::has_backing_memory() const { +bool bloom_filter_alloc::is_wrapped() const { return 
memory_ != nullptr; } @@ -514,7 +514,7 @@ bool bloom_filter_alloc::is_memory_owned() const { } template -const uint8_t* bloom_filter_alloc::get_backing_memory() const { +const uint8_t* bloom_filter_alloc::get_wrapped_memory() const { return memory_; } diff --git a/filters/test/bloom_filter_deserialize_from_java_test.cpp b/filters/test/bloom_filter_deserialize_from_java_test.cpp index f2dee18c..259eb4ff 100644 --- a/filters/test/bloom_filter_deserialize_from_java_test.cpp +++ b/filters/test/bloom_filter_deserialize_from_java_test.cpp @@ -35,7 +35,7 @@ TEST_CASE("bloom_filter", "[serde_compat]") { for (const uint16_t num_hashes: h_arr) { std::ifstream is; is.exceptions(std::ios::failbit | std::ios::badbit); - is.open(testBinaryInputPath + "bf_n" + std::to_string(n) + "_h" + std::to_string(num_hashes) + "_cpp.sk", std::ios::binary); + is.open(testBinaryInputPath + "bf_n" + std::to_string(n) + "_h" + std::to_string(num_hashes) + "_java.sk", std::ios::binary); auto bf = bloom_filter::deserialize(is); REQUIRE(bf.is_empty() == (n == 0)); REQUIRE((bf.is_empty() || (bf.get_bits_used() > n / 10))); diff --git a/filters/test/bloom_filter_test.cpp b/filters/test/bloom_filter_test.cpp index df22cadf..a1e57d1f 100644 --- a/filters/test/bloom_filter_test.cpp +++ b/filters/test/bloom_filter_test.cpp @@ -131,7 +131,7 @@ TEST_CASE("bloom_filter: basic operations", "[bloom_filter]") { } // check that raw memory also matches serialized sketch - const uint8_t* bf_bytes = bf2.get_backing_memory(); + const uint8_t* bf_bytes = bf2.get_wrapped_memory(); REQUIRE(bf_bytes == bf_memory); for (size_t i = 0; i < bytes.size(); ++i) { REQUIRE(bf_bytes[i] == bytes[i]); From 9d53c6c0fba12caa3258995ea467b1f49806a633 Mon Sep 17 00:00:00 2001 From: Jon Malkin <786705+jmalkin@users.noreply.github.com> Date: Thu, 15 Aug 2024 23:47:07 -0700 Subject: [PATCH 25/30] remove const from copied primitives in method signatures --- filters/include/bit_array_ops.hpp | 20 +++++----- filters/include/bloom_filter.hpp 
| 28 ++++++------- filters/include/bloom_filter_builder_impl.hpp | 40 +++++++++---------- 3 files changed, 44 insertions(+), 44 deletions(-) diff --git a/filters/include/bit_array_ops.hpp b/filters/include/bit_array_ops.hpp index cc1a42fa..263678dc 100644 --- a/filters/include/bit_array_ops.hpp +++ b/filters/include/bit_array_ops.hpp @@ -42,7 +42,7 @@ namespace bit_array_ops { * @param index the index of the bit to get * @return the value of the bit at the given index. */ - static inline bool get_bit(uint8_t* array, const uint64_t index) { + static inline bool get_bit(uint8_t* array, uint64_t index) { return (array[index >> 3] & (1 << (index & 7))) != 0; } @@ -51,7 +51,7 @@ namespace bit_array_ops { * @param array the array of bits * @param index the index of the bit to set. */ - static inline void set_bit(uint8_t* array, const uint64_t index) { + static inline void set_bit(uint8_t* array, uint64_t index) { array[index >> 3] |= (1 << (index & 7)); } @@ -60,7 +60,7 @@ namespace bit_array_ops { * @param array the array of bits * @param index the index of the bit to clear. */ - static inline void clear_bit(uint8_t* array, const uint64_t index) { + static inline void clear_bit(uint8_t* array, uint64_t index) { array[index >> 3] &= ~(1 << (index & 7)); } @@ -69,7 +69,7 @@ namespace bit_array_ops { * @param array the array of bits * @param index the index of the bit to set. 
*/ - static inline void assign_bit(uint8_t* array, const uint64_t index, const bool value) { + static inline void assign_bit(uint8_t* array, uint64_t index, bool value) { // read-only checks handled by set_bit() and clear_bit() if (value) { set_bit(array, index); @@ -84,7 +84,7 @@ namespace bit_array_ops { * @param index the index of the bit to get and set * @return the value of the bit at the specified index */ - static inline bool get_and_set_bit(uint8_t* array, const uint64_t index) { + static inline bool get_and_set_bit(uint8_t* array, uint64_t index) { const uint64_t offset = index >> 3; const uint8_t mask = 1 << (index & 7); if ((array[offset] & mask) != 0) { @@ -101,7 +101,7 @@ namespace bit_array_ops { * @param length_bytes the length of the array, in bytes * @return the number of bits set in the bit array. */ - static inline uint64_t count_num_bits_set(uint8_t* array, const uint64_t length_bytes) { + static inline uint64_t count_num_bits_set(uint8_t* array, uint64_t length_bytes) { uint64_t num_bits_set = 0; // we rounded up to a multiple of 64 so we know we can use 64-bit operations @@ -126,7 +126,7 @@ namespace bit_array_ops { * @param length_bytes the length of the two arrays, in bytes * @return the number of bits set in the resulting array */ - static inline uint64_t union_with(uint8_t* tgt, const uint8_t* src, const uint64_t length_bytes) { + static inline uint64_t union_with(uint8_t* tgt, const uint8_t* src, uint64_t length_bytes) { uint64_t num_bits_set = 0; for (uint64_t i = 0; i < length_bytes; ++i) { tgt[i] |= src[i]; @@ -146,7 +146,7 @@ namespace bit_array_ops { * @param length_bytes the length of the two arrays, in bytes * @return the number of bits set in the resulting array */ - static inline uint64_t intersect(uint8_t* tgt, const uint8_t* src, const uint64_t length_bytes) { + static inline uint64_t intersect(uint8_t* tgt, const uint8_t* src, uint64_t length_bytes) { uint64_t num_bits_set = 0; for (uint64_t i = 0; i < length_bytes; ++i) { 
tgt[i] &= src[i]; @@ -163,7 +163,7 @@ namespace bit_array_ops { * @param length_bytes the length of the array, in bytes * @return the number of bits set in the resulting array */ - static inline uint64_t invert(uint8_t* array, const uint64_t length_bytes) { + static inline uint64_t invert(uint8_t* array, uint64_t length_bytes) { uint64_t num_bits_set = 0; for (uint64_t i = 0; i < length_bytes; ++i) { array[i] = ~array[i]; @@ -177,4 +177,4 @@ namespace bit_array_ops { } // namespace datasketches -#endif // _BIT_ARRAY_OPS_HPP_ \ No newline at end of file +#endif // _BIT_ARRAY_OPS_HPP_ diff --git a/filters/include/bloom_filter.hpp b/filters/include/bloom_filter.hpp index 71b170e0..1de09e63 100644 --- a/filters/include/bloom_filter.hpp +++ b/filters/include/bloom_filter.hpp @@ -681,7 +681,7 @@ class bloom_filter_alloc { * @param num_bits The number of bits in the Bloom Filter for the size calculation * @return The serialized size of a Bloom Filter with a capacity of num_bits, in bytes */ - static size_t get_serialized_size_bytes(const uint64_t num_bits); + static size_t get_serialized_size_bytes(uint64_t num_bits); /** * @brief Returns a human-readable string representation of the Bloom Filter. 
@@ -707,17 +707,17 @@ class bloom_filter_alloc { static const uint8_t EMPTY_FLAG_MASK = 4; // used by builder methods - bloom_filter_alloc(const uint64_t num_bits, const uint16_t num_hashes, const uint64_t seed, const A& allocator); - bloom_filter_alloc(uint8_t* memory, size_t length_bytes, const uint64_t num_bits, const uint16_t num_hashes, const uint64_t seed, const A& allocator); + bloom_filter_alloc(uint64_t num_bits, uint16_t num_hashes, uint64_t seed, const A& allocator); + bloom_filter_alloc(uint8_t* memory, size_t length_bytes, uint64_t num_bits, uint16_t num_hashes, uint64_t seed, const A& allocator); // used by deserialize and wrap - bloom_filter_alloc(const uint64_t seed, - const uint16_t num_hashes, - const bool is_dirty, - const bool is_owned, - const bool is_read_only, - const uint64_t capacity_bits, - const uint64_t num_bits_set, + bloom_filter_alloc(uint64_t seed, + uint16_t num_hashes, + bool is_dirty, + bool is_owned, + bool is_read_only, + uint64_t capacity_bits, + uint64_t num_bits_set, uint8_t* bit_array, uint8_t* memory, const A& allocator); @@ -729,11 +729,11 @@ class bloom_filter_alloc { const A& allocator); // internal query/update methods - void internal_update(const uint64_t h0, const uint64_t h1); - bool internal_query_and_update(const uint64_t h0, const uint64_t h1); - bool internal_query(const uint64_t h0, const uint64_t h1) const; + void internal_update(uint64_t h0, uint64_t h1); + bool internal_query_and_update(uint64_t h0, uint64_t h1); + bool internal_query(uint64_t h0, uint64_t h1) const; - void update_num_bits_set(const uint64_t num_bits_set); + void update_num_bits_set(uint64_t num_bits_set); A allocator_; uint64_t seed_; diff --git a/filters/include/bloom_filter_builder_impl.hpp b/filters/include/bloom_filter_builder_impl.hpp index 5b34767a..da637a68 100644 --- a/filters/include/bloom_filter_builder_impl.hpp +++ b/filters/include/bloom_filter_builder_impl.hpp @@ -40,8 +40,8 @@ uint64_t 
bloom_filter_builder_alloc::generate_random_seed() { } template -uint16_t bloom_filter_builder_alloc::suggest_num_hashes(const uint64_t max_distinct_items, - const uint64_t num_filter_bits) { +uint16_t bloom_filter_builder_alloc::suggest_num_hashes(uint64_t max_distinct_items, + uint64_t num_filter_bits) { if (max_distinct_items == 0) { throw std::invalid_argument("maximum number of distinct items must be strictly positive"); } @@ -54,22 +54,22 @@ uint16_t bloom_filter_builder_alloc::suggest_num_hashes(const uint64_t max_di } template -uint16_t bloom_filter_builder_alloc::suggest_num_hashes(const double target_false_positive_prob) { +uint16_t bloom_filter_builder_alloc::suggest_num_hashes(double target_false_positive_prob) { validate_accuracy_inputs(100, target_false_positive_prob); // max_distinct_items is an arbitrary valid value return static_cast(std::ceil(-log(target_false_positive_prob) / log(2.0))); } template -uint64_t bloom_filter_builder_alloc::suggest_num_filter_bits(const uint64_t max_distinct_items, - const double target_false_positive_prob) { +uint64_t bloom_filter_builder_alloc::suggest_num_filter_bits(uint64_t max_distinct_items, + double target_false_positive_prob) { validate_accuracy_inputs(max_distinct_items, target_false_positive_prob); return static_cast(std::ceil(-static_cast(max_distinct_items) * log(target_false_positive_prob) / (log(2.0) * log(2.0)))); } template -bloom_filter_alloc bloom_filter_builder_alloc::create_by_accuracy(const uint64_t max_distinct_items, - const double target_false_positive_prob, - const uint64_t seed, +bloom_filter_alloc bloom_filter_builder_alloc::create_by_accuracy(uint64_t max_distinct_items, + double target_false_positive_prob, + uint64_t seed, const A& allocator) { validate_accuracy_inputs(max_distinct_items, target_false_positive_prob); const uint64_t num_filter_bits = bloom_filter_builder_alloc::suggest_num_filter_bits(max_distinct_items, target_false_positive_prob); @@ -78,9 +78,9 @@ bloom_filter_alloc 
bloom_filter_builder_alloc::create_by_accuracy(const ui } template -bloom_filter_alloc bloom_filter_builder_alloc::create_by_size(const uint64_t num_bits, - const uint16_t num_hashes, - const uint64_t seed, +bloom_filter_alloc bloom_filter_builder_alloc::create_by_size(uint64_t num_bits, + uint16_t num_hashes, + uint64_t seed, const A& allocator) { validate_size_inputs(num_bits, num_hashes); return bloom_filter_alloc(num_bits, num_hashes, seed, allocator); @@ -88,10 +88,10 @@ bloom_filter_alloc bloom_filter_builder_alloc::create_by_size(const uint64 template bloom_filter_alloc bloom_filter_builder_alloc::initialize_by_accuracy(void* memory, - const size_t length_bytes, - const uint64_t max_distinct_items, - const double target_false_positive_prob, - const uint64_t seed, + size_t length_bytes, + uint64_t max_distinct_items, + double target_false_positive_prob, + uint64_t seed, const A& allocator) { validate_accuracy_inputs(max_distinct_items, target_false_positive_prob); const uint64_t num_filter_bits = bloom_filter_builder_alloc::suggest_num_filter_bits(max_distinct_items, target_false_positive_prob); @@ -101,10 +101,10 @@ bloom_filter_alloc bloom_filter_builder_alloc::initialize_by_accuracy(void template bloom_filter_alloc bloom_filter_builder_alloc::initialize_by_size(void* memory, - const size_t length_bytes, - const uint64_t num_bits, - const uint16_t num_hashes, - const uint64_t seed, + size_t length_bytes, + uint64_t num_bits, + uint16_t num_hashes, + uint64_t seed, const A& allocator) { validate_size_inputs(num_bits, num_hashes); return bloom_filter_alloc(static_cast(memory), length_bytes, num_bits, num_hashes, seed, allocator); @@ -134,4 +134,4 @@ void bloom_filter_builder_alloc::validate_accuracy_inputs(uint64_t max_distin } // namespace datasketches -#endif // _BLOOM_FILTER_BUILDER_IMPL_HPP_ \ No newline at end of file +#endif // _BLOOM_FILTER_BUILDER_IMPL_HPP_ From 1705e89d787a8f3af481cc70093af240f1bab88d Mon Sep 17 00:00:00 2001 From: Jon Malkin 
<786705+jmalkin@users.noreply.github.com> Date: Thu, 15 Aug 2024 23:59:18 -0700 Subject: [PATCH 26/30] apparently i clobbered my own const removal edits.. --- filters/CMakeLists.txt | 2 +- filters/include/bloom_filter_impl.hpp | 96 +++++++++++++-------------- 2 files changed, 49 insertions(+), 49 deletions(-) diff --git a/filters/CMakeLists.txt b/filters/CMakeLists.txt index 66bce141..eee081d1 100644 --- a/filters/CMakeLists.txt +++ b/filters/CMakeLists.txt @@ -17,7 +17,7 @@ add_library(filters INTERFACE) -add_library(${PROJECT_NAME}::FILTER ALIAS filters) +add_library(${PROJECT_NAME}::FILTERS ALIAS filters) if (BUILD_TESTS) add_subdirectory(test) diff --git a/filters/include/bloom_filter_impl.hpp b/filters/include/bloom_filter_impl.hpp index 0195eb5d..f82ea341 100644 --- a/filters/include/bloom_filter_impl.hpp +++ b/filters/include/bloom_filter_impl.hpp @@ -37,7 +37,7 @@ namespace datasketches { template -bloom_filter_alloc::bloom_filter_alloc(const uint64_t num_bits, const uint16_t num_hashes, uint64_t seed, const A& allocator) : +bloom_filter_alloc::bloom_filter_alloc(uint64_t num_bits, uint16_t num_hashes, uint64_t seed, const A& allocator) : allocator_(allocator), seed_(seed), num_hashes_(num_hashes), @@ -68,9 +68,9 @@ bloom_filter_alloc::bloom_filter_alloc(const uint64_t num_bits, const uint16_ template bloom_filter_alloc::bloom_filter_alloc(uint8_t* memory, size_t length_bytes, - const uint64_t num_bits, - const uint16_t num_hashes, - const uint64_t seed, + uint64_t num_bits, + uint16_t num_hashes, + uint64_t seed, const A& allocator) : allocator_(allocator), seed_(seed), @@ -119,13 +119,13 @@ bloom_filter_alloc::bloom_filter_alloc(uint8_t* memory, } template -bloom_filter_alloc::bloom_filter_alloc(const uint64_t seed, - const uint16_t num_hashes, - const bool is_dirty, - const bool is_owned, - const bool is_read_only, - const uint64_t capacity_bits, - const uint64_t num_bits_set, +bloom_filter_alloc::bloom_filter_alloc(uint64_t seed, + uint16_t num_hashes, 
+ bool is_dirty, + bool is_owned, + bool is_read_only, + uint64_t capacity_bits, + uint64_t num_bits_set, uint8_t* bit_array, uint8_t* memory, const A& allocator) : @@ -182,8 +182,8 @@ bloom_filter_alloc::bloom_filter_alloc(bloom_filter_alloc&& other) noexcept : is_read_only_(other.is_read_only_), capacity_bits_(other.capacity_bits_), num_bits_set_(other.num_bits_set_), - bit_array_(other.bit_array_), - memory_(other.memory_) + bit_array_(std::move(other.bit_array_)), + memory_(std::move(other.memory_)) { // ensure destructor on other will behave nicely other.is_owned_ = false; @@ -461,7 +461,7 @@ size_t bloom_filter_alloc::get_serialized_size_bytes() const { } template -size_t bloom_filter_alloc::get_serialized_size_bytes(const uint64_t num_bits) { +size_t bloom_filter_alloc::get_serialized_size_bytes(uint64_t num_bits) { if (num_bits == 0) throw std::invalid_argument("Number of bits must be greater than zero"); @@ -539,7 +539,7 @@ void bloom_filter_alloc::update_num_bits_set(uint64_t num_bits_set) { // UPDATE METHODS template -void bloom_filter_alloc::update(const std::string& item) { +void bloom_filter_alloc::update(std::string& item) { if (item.empty()) return; const uint64_t h0 = XXHash64::hash(item.data(), item.size(), seed_); const uint64_t h1 = XXHash64::hash(item.data(), item.size(), h0); @@ -547,51 +547,51 @@ void bloom_filter_alloc::update(const std::string& item) { } template -void bloom_filter_alloc::update(const uint64_t item) { +void bloom_filter_alloc::update(uint64_t item) { const uint64_t h0 = XXHash64::hash(&item, sizeof(item), seed_); const uint64_t h1 = XXHash64::hash(&item, sizeof(item), h0); internal_update(h0, h1); } template -void bloom_filter_alloc::update(const uint32_t item) { +void bloom_filter_alloc::update(uint32_t item) { update(static_cast(item)); } template -void bloom_filter_alloc::update(const uint16_t item) { +void bloom_filter_alloc::update(uint16_t item) { update(static_cast(item)); } template -void 
bloom_filter_alloc::update(const uint8_t item) { +void bloom_filter_alloc::update(uint8_t item) { update(static_cast(item)); } template -void bloom_filter_alloc::update(const int64_t item) { +void bloom_filter_alloc::update(int64_t item) { const uint64_t h0 = XXHash64::hash(&item, sizeof(item), seed_); const uint64_t h1 = XXHash64::hash(&item, sizeof(item), h0); internal_update(h0, h1); } template -void bloom_filter_alloc::update(const int32_t item) { +void bloom_filter_alloc::update(int32_t item) { update(static_cast(item)); } template -void bloom_filter_alloc::update(const int16_t item) { +void bloom_filter_alloc::update(int16_t item) { update(static_cast(item)); } template -void bloom_filter_alloc::update(const int8_t item) { +void bloom_filter_alloc::update(int8_t item) { update(static_cast(item)); } template -void bloom_filter_alloc::update(const double item) { +void bloom_filter_alloc::update(double item) { union { int64_t long_value; double double_value; @@ -608,7 +608,7 @@ void bloom_filter_alloc::update(const double item) { } template -void bloom_filter_alloc::update(const float item) { +void bloom_filter_alloc::update(float item) { update(static_cast(item)); } @@ -621,7 +621,7 @@ void bloom_filter_alloc::update(const void* item, size_t size) { } template -void bloom_filter_alloc::internal_update(const uint64_t h0, const uint64_t h1) { +void bloom_filter_alloc::internal_update(uint64_t h0, uint64_t h1) { if (is_read_only_) { throw std::logic_error("Cannot update a read-only filter"); } @@ -644,51 +644,51 @@ bool bloom_filter_alloc::query_and_update(const std::string& item) { } template -bool bloom_filter_alloc::query_and_update(const uint64_t item) { +bool bloom_filter_alloc::query_and_update(uint64_t item) { const uint64_t h0 = XXHash64::hash(&item, sizeof(item), seed_); const uint64_t h1 = XXHash64::hash(&item, sizeof(item), h0); return internal_query_and_update(h0, h1); } template -bool bloom_filter_alloc::query_and_update(const uint32_t item) { +bool 
bloom_filter_alloc::query_and_update(uint32_t item) { return query_and_update(static_cast(item)); } template -bool bloom_filter_alloc::query_and_update(const uint16_t item) { +bool bloom_filter_alloc::query_and_update(uint16_t item) { return query_and_update(static_cast(item)); } template -bool bloom_filter_alloc::query_and_update(const uint8_t item) { +bool bloom_filter_alloc::query_and_update(uint8_t item) { return query_and_update(static_cast(item)); } template -bool bloom_filter_alloc::query_and_update(const int64_t item) { +bool bloom_filter_alloc::query_and_update(int64_t item) { const uint64_t h0 = XXHash64::hash(&item, sizeof(item), seed_); const uint64_t h1 = XXHash64::hash(&item, sizeof(item), h0); return internal_query_and_update(h0, h1); } template -bool bloom_filter_alloc::query_and_update(const int32_t item) { +bool bloom_filter_alloc::query_and_update(int32_t item) { return query_and_update(static_cast(item)); } template -bool bloom_filter_alloc::query_and_update(const int16_t item) { +bool bloom_filter_alloc::query_and_update(int16_t item) { return query_and_update(static_cast(item)); } template -bool bloom_filter_alloc::query_and_update(const int8_t item) { +bool bloom_filter_alloc::query_and_update(int8_t item) { return query_and_update(static_cast(item)); } template -bool bloom_filter_alloc::query_and_update(const double item) { +bool bloom_filter_alloc::query_and_update(double item) { union { int64_t long_value; double double_value; @@ -705,7 +705,7 @@ bool bloom_filter_alloc::query_and_update(const double item) { } template -bool bloom_filter_alloc::query_and_update(const float item) { +bool bloom_filter_alloc::query_and_update(float item) { return query_and_update(static_cast(item)); } @@ -718,7 +718,7 @@ bool bloom_filter_alloc::query_and_update(const void* item, size_t size) { } template -bool bloom_filter_alloc::internal_query_and_update(const uint64_t h0, const uint64_t h1) { +bool bloom_filter_alloc::internal_query_and_update(uint64_t h0, 
uint64_t h1) { if (is_read_only_) { throw std::logic_error("Cannot update a read-only filter"); } @@ -744,51 +744,51 @@ bool bloom_filter_alloc::query(const std::string& item) const { } template -bool bloom_filter_alloc::query(const uint64_t item) const { +bool bloom_filter_alloc::query(uint64_t item) const { const uint64_t h0 = XXHash64::hash(&item, sizeof(item), seed_); const uint64_t h1 = XXHash64::hash(&item, sizeof(item), h0); return internal_query(h0, h1); } template -bool bloom_filter_alloc::query(const uint32_t item) const { +bool bloom_filter_alloc::query(uint32_t item) const { return query(static_cast(item)); } template -bool bloom_filter_alloc::query(const uint16_t item) const { +bool bloom_filter_alloc::query(uint16_t item) const { return query(static_cast(item)); } template -bool bloom_filter_alloc::query(const uint8_t item) const { +bool bloom_filter_alloc::query(uint8_t item) const { return query(static_cast(item)); } template -bool bloom_filter_alloc::query(const int64_t item) const { +bool bloom_filter_alloc::query(int64_t item) const { const uint64_t h0 = XXHash64::hash(&item, sizeof(item), seed_); const uint64_t h1 = XXHash64::hash(&item, sizeof(item), h0); return internal_query(h0, h1); } template -bool bloom_filter_alloc::query(const int32_t item) const { +bool bloom_filter_alloc::query(int32_t item) const { return query(static_cast(item)); } template -bool bloom_filter_alloc::query(const int16_t item) const { +bool bloom_filter_alloc::query(int16_t item) const { return query(static_cast(item)); } template -bool bloom_filter_alloc::query(const int8_t item) const { +bool bloom_filter_alloc::query(int8_t item) const { return query(static_cast(item)); } template -bool bloom_filter_alloc::query(const double item) const { +bool bloom_filter_alloc::query(double item) const { union { int64_t long_value; double double_value; @@ -805,7 +805,7 @@ bool bloom_filter_alloc::query(const double item) const { } template -bool bloom_filter_alloc::query(const 
float item) const { +bool bloom_filter_alloc::query(float item) const { return query(static_cast(item)); } @@ -818,7 +818,7 @@ bool bloom_filter_alloc::query(const void* item, size_t size) const { } template -bool bloom_filter_alloc::internal_query(const uint64_t h0, const uint64_t h1) const { +bool bloom_filter_alloc::internal_query(uint64_t h0, uint64_t h1) const { if (is_empty()) return false; const uint64_t num_bits = get_capacity(); for (uint16_t i = 1; i <= num_hashes_; i++) { From 1ac743fa666a80879a5cf30e1859941814859afc Mon Sep 17 00:00:00 2001 From: Jon Malkin <786705+jmalkin@users.noreply.github.com> Date: Fri, 16 Aug 2024 00:35:56 -0700 Subject: [PATCH 27/30] managed to conflict with myself when jumping between boxes. resolved now. --- filters/include/bloom_filter.hpp | 57 +++++++++++++-------------- filters/include/bloom_filter_impl.hpp | 2 +- 2 files changed, 28 insertions(+), 31 deletions(-) diff --git a/filters/include/bloom_filter.hpp b/filters/include/bloom_filter.hpp index 78396277..a38d0413 100644 --- a/filters/include/bloom_filter.hpp +++ b/filters/include/bloom_filter.hpp @@ -47,8 +47,6 @@ using bloom_filter_builder = bloom_filter_builder_alloc> */ template> class bloom_filter_builder_alloc { - using A = Allocator; - public: /** * Returns the optimal number of hash functions to given target numbers of distinct items @@ -58,14 +56,14 @@ class bloom_filter_builder_alloc { * @param num_filter_bits The intended size of the Bloom Filter in bits * @return The suggested number of hash functions to use with the filter */ - static uint16_t suggest_num_hashes(const uint64_t max_distinct_items, const uint64_t num_filter_bits); + static uint16_t suggest_num_hashes(uint64_t max_distinct_items, uint64_t num_filter_bits); /** * Returns the optimal number of hash functions to achieve a target false positive probability. 
* @param target_false_positive_prob A desired false positive probability per item * @return The suggested number of hash functions to use with the filter. */ - static uint16_t suggest_num_hashes(const double target_false_positive_prob); + static uint16_t suggest_num_hashes(double target_false_positive_prob); /** * Returns the optimal number of bits to use in a Bloom filter given a target number of distinct @@ -74,7 +72,7 @@ class bloom_filter_builder_alloc { * @param target_false_positive_prob A desired false positive probability per item * @return The suggested number of bits to use with the filter */ - static uint64_t suggest_num_filter_bits(const uint64_t max_distinct_items, const double target_false_positive_prob); + static uint64_t suggest_num_filter_bits(uint64_t max_distinct_items, double target_false_positive_prob); /** * Creates a new Bloom filter with an optimal number of bits and hash functions for the given inputs, @@ -85,10 +83,10 @@ class bloom_filter_builder_alloc { * @param allocator The allocator to use for the filter (default: standard allocator) * @return A new Bloom filter configured for the given input parameters */ - static bloom_filter_alloc create_by_accuracy(const uint64_t max_distinct_items, - const double target_false_positive_prob, - const uint64_t seed = generate_random_seed(), - const Allocator& allocator = Allocator()); + static bloom_filter_alloc create_by_accuracy(uint64_t max_distinct_items, + double target_false_positive_prob, + uint64_t seed = generate_random_seed(), + const Allocator& allocator = Allocator()); /** * Creates a Bloom filter with given number of bits and number of hash functions, @@ -100,10 +98,10 @@ class bloom_filter_builder_alloc { * @param allocator The allocator to use for the filter (default: standard allocator) * @return A new Bloom filter configured for the given input parameters */ - static bloom_filter_alloc create_by_size(const uint64_t num_bits, - const uint16_t num_hashes, - const uint64_t seed = 
generate_random_seed(), - const Allocator& allocator = Allocator()); + static bloom_filter_alloc create_by_size(uint64_t num_bits, + uint16_t num_hashes, + uint64_t seed = generate_random_seed(), + const Allocator& allocator = Allocator()); /** * Creates a new Bloom filter with an optimal number of bits and hash functions for the given inputs, @@ -118,12 +116,12 @@ class bloom_filter_builder_alloc { * @param allocator The allocator to use for the filter (default: standard allocator) * @return A new Bloom filter configured for the given input parameters in the provided memory */ - static bloom_filter_alloc initialize_by_accuracy(void* memory, - const size_t length_bytes, - const uint64_t max_distinct_items, - const double target_false_positive_prob, - const uint64_t seed = generate_random_seed(), - const Allocator& allocator = Allocator()); + static bloom_filter_alloc initialize_by_accuracy(void* memory, + size_t length_bytes, + uint64_t max_distinct_items, + double target_false_positive_prob, + uint64_t seed = generate_random_seed(), + const Allocator& allocator = Allocator()); /** * Initializes a Bloom filter with given number of bits and number of hash functions, @@ -138,12 +136,12 @@ class bloom_filter_builder_alloc { * @param allocator The allocator to use for the filter (default: standard allocator) * @return A new BloomFilter configured for the given input parameters */ - static bloom_filter_alloc initialize_by_size(void* memory, - const size_t length_bytes, - const uint64_t num_bits, - const uint16_t num_hashes, - const uint64_t seed = generate_random_seed(), - const Allocator& allocator = Allocator()); + static bloom_filter_alloc initialize_by_size(void* memory, + size_t length_bytes, + uint64_t num_bits, + uint16_t num_hashes, + uint64_t seed = generate_random_seed(), + const Allocator& allocator = Allocator()); /** * @brief Generates a random 64-bit seed value @@ -184,8 +182,6 @@ class bloom_filter_builder_alloc { template> class bloom_filter_alloc { - 
using A = Allocator; - public: /** @@ -259,7 +255,7 @@ class bloom_filter_alloc { // This is a convenience alias for users // The type returned by the following serialize method - using vector_bytes = std::vector::template rebind_alloc>; + using vector_bytes = std::vector::template rebind_alloc>; /** * This method serializes the filter as a vector of bytes. @@ -688,9 +684,10 @@ class bloom_filter_alloc { * @param print_filter If true, the filter bits will be printed as well. * @return A human-readable string representation of the Bloom Filter. */ - string to_string(bool print_filter = false) const; + string to_string(bool print_filter = false) const; private: + using A = Allocator; using AllocUint8 = typename std::allocator_traits::template rebind_alloc; static const uint64_t DIRTY_BITS_VALUE = static_cast(-1LL); @@ -735,7 +732,7 @@ class bloom_filter_alloc { void update_num_bits_set(uint64_t num_bits_set); - A allocator_; + Allocator allocator_; uint64_t seed_; uint16_t num_hashes_; bool is_dirty_; diff --git a/filters/include/bloom_filter_impl.hpp b/filters/include/bloom_filter_impl.hpp index f82ea341..dc022ba4 100644 --- a/filters/include/bloom_filter_impl.hpp +++ b/filters/include/bloom_filter_impl.hpp @@ -539,7 +539,7 @@ void bloom_filter_alloc::update_num_bits_set(uint64_t num_bits_set) { // UPDATE METHODS template -void bloom_filter_alloc::update(std::string& item) { +void bloom_filter_alloc::update(const std::string& item) { if (item.empty()) return; const uint64_t h0 = XXHash64::hash(item.data(), item.size(), seed_); const uint64_t h1 = XXHash64::hash(item.data(), item.size(), h0); From fbc311931f4cc74063df60b6e2896e4de4d28aa7 Mon Sep 17 00:00:00 2001 From: Jon Malkin <786705+jmalkin@users.noreply.github.com> Date: Fri, 16 Aug 2024 00:43:56 -0700 Subject: [PATCH 28/30] move builder class inside bloom_filter --- filters/include/bloom_filter.hpp | 240 +++++++++--------- filters/include/bloom_filter_builder_impl.hpp | 30 +-- 
filters/test/bloom_filter_allocation_test.cpp | 5 +- filters/test/bloom_filter_test.cpp | 44 ++-- 4 files changed, 160 insertions(+), 159 deletions(-) diff --git a/filters/include/bloom_filter.hpp b/filters/include/bloom_filter.hpp index a38d0413..b6b4b960 100644 --- a/filters/include/bloom_filter.hpp +++ b/filters/include/bloom_filter.hpp @@ -35,125 +35,6 @@ template class bloom_filter_builder_alloc; // aliases with default allocator using bloom_filter = bloom_filter_alloc>; -using bloom_filter_builder = bloom_filter_builder_alloc>; - -/** - *

This class provides methods to help estimate the correct parameters when - * creating a Bloom filter, and methods to create the filter using those values.

- * - *

The underlying math is described in the - * - * Wikipedia article on Bloom filters.

- */ -template> -class bloom_filter_builder_alloc { -public: - /** - * Returns the optimal number of hash functions to given target numbers of distinct items - * and the Bloom filter size in bits. This function will provide a result even if the input - * values exceed the capacity of a single Bloom filter. - * @param max_distinct_items The maximum expected number of distinct items to add to the filter - * @param num_filter_bits The intended size of the Bloom Filter in bits - * @return The suggested number of hash functions to use with the filter - */ - static uint16_t suggest_num_hashes(uint64_t max_distinct_items, uint64_t num_filter_bits); - - /** - * Returns the optimal number of hash functions to achieve a target false positive probability. - * @param target_false_positive_prob A desired false positive probability per item - * @return The suggested number of hash functions to use with the filter. - */ - static uint16_t suggest_num_hashes(double target_false_positive_prob); - - /** - * Returns the optimal number of bits to use in a Bloom filter given a target number of distinct - * items and a target false positive probability. - * @param max_distinct_items The maximum expected number of distinct items to add to the filter - * @param target_false_positive_prob A desired false positive probability per item - * @return The suggested number of bits to use with the filter - */ - static uint64_t suggest_num_filter_bits(uint64_t max_distinct_items, double target_false_positive_prob); - - /** - * Creates a new Bloom filter with an optimal number of bits and hash functions for the given inputs, - * using a random base seed for the hash function. 
- * @param max_distinct_items The maximum expected number of distinct items to add to the filter - * @param target_false_positive_prob A desired false positive probability per item - * @param seed A bash hash seed (default: random) - * @param allocator The allocator to use for the filter (default: standard allocator) - * @return A new Bloom filter configured for the given input parameters - */ - static bloom_filter_alloc create_by_accuracy(uint64_t max_distinct_items, - double target_false_positive_prob, - uint64_t seed = generate_random_seed(), - const Allocator& allocator = Allocator()); - - /** - * Creates a Bloom filter with given number of bits and number of hash functions, - * using the provided base seed for the hash function. - * - * @param num_bits The size of the BloomFilter, in bits - * @param num_hashes The number of hash functions to apply to items - * @param seed A base hash seed (default: random) - * @param allocator The allocator to use for the filter (default: standard allocator) - * @return A new Bloom filter configured for the given input parameters - */ - static bloom_filter_alloc create_by_size(uint64_t num_bits, - uint16_t num_hashes, - uint64_t seed = generate_random_seed(), - const Allocator& allocator = Allocator()); - - /** - * Creates a new Bloom filter with an optimal number of bits and hash functions for the given inputs, - * using a random base seed for the hash function and writing into the provided memory. The filter does - * not take ownership of the memory but does overwrite the full contents. 
- * - * @param memory A pointer to the memory to use for the filter - * @param length_bytes The length of the memory in bytes - * @param max_distinct_items The maximum expected number of distinct items to add to the filter - * @param target_false_positive_prob A desired false positive probability per item - * @param dstMem A WritableMemory to hold the initialized filter - * @param allocator The allocator to use for the filter (default: standard allocator) - * @return A new Bloom filter configured for the given input parameters in the provided memory - */ - static bloom_filter_alloc initialize_by_accuracy(void* memory, - size_t length_bytes, - uint64_t max_distinct_items, - double target_false_positive_prob, - uint64_t seed = generate_random_seed(), - const Allocator& allocator = Allocator()); - - /** - * Initializes a Bloom filter with given number of bits and number of hash functions, - * using the provided base seed for the hash function and writing into the provided memory. The filter does - * not take ownership of the memory but does overwrite the full contents. 
- * - * @param memory A pointer to the memory to use for the filter - * @param length_bytes The length of the memory in bytes - * @param num_bits The size of the BloomFilter, in bits - * @param num_hashes The number of hash functions to apply to items - * @param seed A base hash seed (default: random) - * @param allocator The allocator to use for the filter (default: standard allocator) - * @return A new BloomFilter configured for the given input parameters - */ - static bloom_filter_alloc initialize_by_size(void* memory, - size_t length_bytes, - uint64_t num_bits, - uint16_t num_hashes, - uint64_t seed = generate_random_seed(), - const Allocator& allocator = Allocator()); - - /** - * @brief Generates a random 64-bit seed value - * - * @return uint64_t a random value over the range of unsigned 64-bit integers - */ - static uint64_t generate_random_seed(); - -private: - static void validate_size_inputs(uint64_t num_bits, uint16_t num_hashes); - static void validate_accuracy_inputs(uint64_t max_distinct_items, double target_false_positive_prob); -}; /** *

A Bloom filter is a data structure that can be used for probabilistic @@ -184,6 +65,9 @@ template> class bloom_filter_alloc { public: + // no public constructor; use builder or deserialize/wrap methods + class builder; + /** * This method deserializes a Bloom filter from a given array of bytes. * @param bytes pointer to the array of bytes @@ -746,6 +630,124 @@ class bloom_filter_alloc { friend class bloom_filter_builder_alloc; }; +/** + *

This class provides methods to help estimate the correct parameters when + * creating a Bloom filter, and methods to create the filter using those values.

+ * + *

The underlying math is described in the + * + * Wikipedia article on Bloom filters.

+ */ +template +class bloom_filter_alloc::builder { +public: + /** + * Returns the optimal number of hash functions to use given a target number of distinct items + * and the Bloom filter size in bits. This function will provide a result even if the input + * values exceed the capacity of a single Bloom filter. + * @param max_distinct_items The maximum expected number of distinct items to add to the filter + * @param num_filter_bits The intended size of the Bloom Filter in bits + * @return The suggested number of hash functions to use with the filter + */ + static uint16_t suggest_num_hashes(uint64_t max_distinct_items, uint64_t num_filter_bits); + + /** + * Returns the optimal number of hash functions to achieve a target false positive probability. + * @param target_false_positive_prob A desired false positive probability per item + * @return The suggested number of hash functions to use with the filter. + */ + static uint16_t suggest_num_hashes(double target_false_positive_prob); + + /** + * Returns the optimal number of bits to use in a Bloom filter given a target number of distinct + * items and a target false positive probability. + * @param max_distinct_items The maximum expected number of distinct items to add to the filter + * @param target_false_positive_prob A desired false positive probability per item + * @return The suggested number of bits to use with the filter + */ + static uint64_t suggest_num_filter_bits(uint64_t max_distinct_items, double target_false_positive_prob); + + /** + * Creates a new Bloom filter with an optimal number of bits and hash functions for the given inputs, + * using a random base seed for the hash function. 
+ * @param max_distinct_items The maximum expected number of distinct items to add to the filter + * @param target_false_positive_prob A desired false positive probability per item + * @param seed A base hash seed (default: random) + * @param allocator The allocator to use for the filter (default: standard allocator) + * @return A new Bloom filter configured for the given input parameters + */ + static bloom_filter_alloc create_by_accuracy(uint64_t max_distinct_items, + double target_false_positive_prob, + uint64_t seed = generate_random_seed(), + const Allocator& allocator = Allocator()); + + /** + * Creates a Bloom filter with given number of bits and number of hash functions, + * using the provided base seed for the hash function. + * + * @param num_bits The size of the BloomFilter, in bits + * @param num_hashes The number of hash functions to apply to items + * @param seed A base hash seed (default: random) + * @param allocator The allocator to use for the filter (default: standard allocator) + * @return A new Bloom filter configured for the given input parameters + */ + static bloom_filter_alloc create_by_size(uint64_t num_bits, + uint16_t num_hashes, + uint64_t seed = generate_random_seed(), + const Allocator& allocator = Allocator()); + + /** + * Creates a new Bloom filter with an optimal number of bits and hash functions for the given inputs, + * using a random base seed for the hash function and writing into the provided memory. The filter does + * not take ownership of the memory but does overwrite the full contents. 
+ * + * @param memory A pointer to the memory to use for the filter + * @param length_bytes The length of the memory in bytes + * @param max_distinct_items The maximum expected number of distinct items to add to the filter + * @param target_false_positive_prob A desired false positive probability per item + * @param seed A base hash seed (default: random) + * @param allocator The allocator to use for the filter (default: standard allocator) + * @return A new Bloom filter configured for the given input parameters in the provided memory + */ + static bloom_filter_alloc initialize_by_accuracy(void* memory, + size_t length_bytes, + uint64_t max_distinct_items, + double target_false_positive_prob, + uint64_t seed = generate_random_seed(), + const Allocator& allocator = Allocator()); + + /** + * Initializes a Bloom filter with given number of bits and number of hash functions, + * using the provided base seed for the hash function and writing into the provided memory. The filter does + * not take ownership of the memory but does overwrite the full contents. 
+ * + * @param memory A pointer to the memory to use for the filter + * @param length_bytes The length of the memory in bytes + * @param num_bits The size of the BloomFilter, in bits + * @param num_hashes The number of hash functions to apply to items + * @param seed A base hash seed (default: random) + * @param allocator The allocator to use for the filter (default: standard allocator) + * @return A new BloomFilter configured for the given input parameters + */ + static bloom_filter_alloc initialize_by_size(void* memory, + size_t length_bytes, + uint64_t num_bits, + uint16_t num_hashes, + uint64_t seed = generate_random_seed(), + const Allocator& allocator = Allocator()); + + /** + * @brief Generates a random 64-bit seed value + * + * @return uint64_t a random value over the range of unsigned 64-bit integers + */ + static uint64_t generate_random_seed(); + +private: + static void validate_size_inputs(uint64_t num_bits, uint16_t num_hashes); + static void validate_accuracy_inputs(uint64_t max_distinct_items, double target_false_positive_prob); +}; + } // namespace datasketches #include "bloom_filter_builder_impl.hpp" diff --git a/filters/include/bloom_filter_builder_impl.hpp b/filters/include/bloom_filter_builder_impl.hpp index 26bd2634..a414b9b7 100644 --- a/filters/include/bloom_filter_builder_impl.hpp +++ b/filters/include/bloom_filter_builder_impl.hpp @@ -30,13 +30,13 @@ namespace datasketches { template -uint64_t bloom_filter_builder_alloc::generate_random_seed() { +uint64_t bloom_filter_alloc::builder::generate_random_seed() { return random_utils::next_uint64(random_utils::rand); } template -uint16_t bloom_filter_builder_alloc::suggest_num_hashes(uint64_t max_distinct_items, - uint64_t num_filter_bits) { +uint16_t bloom_filter_alloc::builder::suggest_num_hashes(uint64_t max_distinct_items, + uint64_t num_filter_bits) { if (max_distinct_items == 0) { throw std::invalid_argument("maximum number of distinct items must be strictly positive"); } @@ -49,31 +49,31 
@@ uint16_t bloom_filter_builder_alloc::suggest_num_hashes(uint64_t max_distinct } template -uint16_t bloom_filter_builder_alloc::suggest_num_hashes(double target_false_positive_prob) { +uint16_t bloom_filter_alloc::builder::suggest_num_hashes(double target_false_positive_prob) { validate_accuracy_inputs(100, target_false_positive_prob); // max_distinct_items is an arbitrary valid value return static_cast(std::ceil(-log(target_false_positive_prob) / log(2.0))); } template -uint64_t bloom_filter_builder_alloc::suggest_num_filter_bits(uint64_t max_distinct_items, +uint64_t bloom_filter_alloc::builder::suggest_num_filter_bits(uint64_t max_distinct_items, double target_false_positive_prob) { validate_accuracy_inputs(max_distinct_items, target_false_positive_prob); return static_cast(std::ceil(-static_cast(max_distinct_items) * log(target_false_positive_prob) / (log(2.0) * log(2.0)))); } template -bloom_filter_alloc bloom_filter_builder_alloc::create_by_accuracy(uint64_t max_distinct_items, +bloom_filter_alloc bloom_filter_alloc::builder::create_by_accuracy(uint64_t max_distinct_items, double target_false_positive_prob, uint64_t seed, const A& allocator) { validate_accuracy_inputs(max_distinct_items, target_false_positive_prob); - const uint64_t num_filter_bits = bloom_filter_builder_alloc::suggest_num_filter_bits(max_distinct_items, target_false_positive_prob); - const uint16_t num_hashes = bloom_filter_builder_alloc::suggest_num_hashes(target_false_positive_prob); + const uint64_t num_filter_bits = bloom_filter_alloc::builder::suggest_num_filter_bits(max_distinct_items, target_false_positive_prob); + const uint16_t num_hashes = bloom_filter_alloc::builder::suggest_num_hashes(target_false_positive_prob); return bloom_filter_alloc(num_filter_bits, num_hashes, seed, allocator); } template -bloom_filter_alloc bloom_filter_builder_alloc::create_by_size(uint64_t num_bits, +bloom_filter_alloc bloom_filter_alloc::builder::create_by_size(uint64_t num_bits, uint16_t num_hashes, 
uint64_t seed, const A& allocator) { @@ -82,20 +82,20 @@ bloom_filter_alloc bloom_filter_builder_alloc::create_by_size(uint64_t num } template -bloom_filter_alloc bloom_filter_builder_alloc::initialize_by_accuracy(void* memory, +bloom_filter_alloc bloom_filter_alloc::builder::initialize_by_accuracy(void* memory, size_t length_bytes, uint64_t max_distinct_items, double target_false_positive_prob, uint64_t seed, const A& allocator) { validate_accuracy_inputs(max_distinct_items, target_false_positive_prob); - const uint64_t num_filter_bits = bloom_filter_builder_alloc::suggest_num_filter_bits(max_distinct_items, target_false_positive_prob); - const uint16_t num_hashes = bloom_filter_builder_alloc::suggest_num_hashes(target_false_positive_prob); + const uint64_t num_filter_bits = bloom_filter_alloc::builder::suggest_num_filter_bits(max_distinct_items, target_false_positive_prob); + const uint16_t num_hashes = bloom_filter_alloc::builder::suggest_num_hashes(target_false_positive_prob); return bloom_filter_alloc(static_cast(memory), length_bytes, num_filter_bits, num_hashes, seed, allocator); } template -bloom_filter_alloc bloom_filter_builder_alloc::initialize_by_size(void* memory, +bloom_filter_alloc bloom_filter_alloc::builder::initialize_by_size(void* memory, size_t length_bytes, uint64_t num_bits, uint16_t num_hashes, @@ -106,7 +106,7 @@ bloom_filter_alloc bloom_filter_builder_alloc::initialize_by_size(void* me } template -void bloom_filter_builder_alloc::validate_size_inputs(uint64_t num_bits, uint16_t num_hashes) { +void bloom_filter_alloc::builder::validate_size_inputs(uint64_t num_bits, uint16_t num_hashes) { if (num_bits == 0) { throw std::invalid_argument("number of bits in the filter must be strictly positive"); } else if (num_bits > bloom_filter_alloc::MAX_FILTER_SIZE_BITS) { @@ -118,7 +118,7 @@ void bloom_filter_builder_alloc::validate_size_inputs(uint64_t num_bits, uint } template -void bloom_filter_builder_alloc::validate_accuracy_inputs(uint64_t 
max_distinct_items, double target_false_positive_prob) { +void bloom_filter_alloc::builder::validate_accuracy_inputs(uint64_t max_distinct_items, double target_false_positive_prob) { if (max_distinct_items == 0) { throw std::invalid_argument("maximum number of distinct items must be strictly positive"); } diff --git a/filters/test/bloom_filter_allocation_test.cpp b/filters/test/bloom_filter_allocation_test.cpp index c95f56f8..7f153663 100644 --- a/filters/test/bloom_filter_allocation_test.cpp +++ b/filters/test/bloom_filter_allocation_test.cpp @@ -27,7 +27,6 @@ namespace datasketches { -using bloom_filter_builder_test_alloc = bloom_filter_builder_alloc>; using bloom_filter_test_alloc = bloom_filter_alloc>; using alloc = test_allocator; @@ -37,8 +36,8 @@ TEST_CASE("bloom filter allocation test", "[bloom_filter][test_type]") { { int64_t num_items = 10000; double fpp = 0.01; - uint64_t seed = bloom_filter_builder_test_alloc::generate_random_seed(); - auto bf1 = bloom_filter_builder_test_alloc::create_by_accuracy(num_items, + uint64_t seed = bloom_filter_test_alloc::builder::generate_random_seed(); + auto bf1 = bloom_filter_test_alloc::builder::create_by_accuracy(num_items, fpp, seed, alloc(0)); diff --git a/filters/test/bloom_filter_test.cpp b/filters/test/bloom_filter_test.cpp index a1e57d1f..eb7157b1 100644 --- a/filters/test/bloom_filter_test.cpp +++ b/filters/test/bloom_filter_test.cpp @@ -30,20 +30,20 @@ static std::string testBinaryInputPath = "test/"; namespace datasketches { TEST_CASE("bloom_filter: invalid constructor args", "[bloom_filter]") { - REQUIRE_THROWS_AS(bloom_filter_builder::create_by_size(0, 4), std::invalid_argument); - REQUIRE_THROWS_AS(bloom_filter_builder::create_by_size(1L << 60, 4), std::invalid_argument); - REQUIRE_THROWS_AS(bloom_filter_builder::create_by_size(65535, 0), std::invalid_argument); + REQUIRE_THROWS_AS(bloom_filter::builder::create_by_size(0, 4), std::invalid_argument); + 
REQUIRE_THROWS_AS(bloom_filter::builder::create_by_size(1L << 60, 4), std::invalid_argument); + REQUIRE_THROWS_AS(bloom_filter::builder::create_by_size(65535, 0), std::invalid_argument); } TEST_CASE("bloom_filter: standard constructors", "[bloom_filter]") { uint64_t num_items = 4000; double fpp = 0.01; - uint64_t num_bits = bloom_filter_builder::suggest_num_filter_bits(num_items, fpp); - uint16_t num_hashes = bloom_filter_builder::suggest_num_hashes(num_items, num_bits); + uint64_t num_bits = bloom_filter::builder::suggest_num_filter_bits(num_items, fpp); + uint16_t num_hashes = bloom_filter::builder::suggest_num_hashes(num_items, num_bits); uint64_t seed = 89023; - auto bf = bloom_filter_builder::create_by_size(num_bits, num_hashes, seed); + auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes, seed); uint64_t adjusted_num_bits = (num_bits + 63) & ~0x3F; // round up to the nearest multiple of 64 REQUIRE(bf.get_capacity() == adjusted_num_bits); REQUIRE(bf.get_num_hashes() == num_hashes); @@ -51,7 +51,7 @@ TEST_CASE("bloom_filter: standard constructors", "[bloom_filter]") { REQUIRE(bf.is_empty()); // should match above - bf = bloom_filter_builder::create_by_accuracy(num_items, fpp, seed); + bf = bloom_filter::builder::create_by_accuracy(num_items, fpp, seed); REQUIRE(bf.get_capacity() == adjusted_num_bits); REQUIRE(bf.get_num_hashes() == num_hashes); REQUIRE(bf.get_seed() == seed); @@ -61,13 +61,13 @@ TEST_CASE("bloom_filter: standard constructors", "[bloom_filter]") { size_t serialized_size_bytes = bloom_filter::get_serialized_size_bytes(num_bits); uint8_t* bytes = new uint8_t[serialized_size_bytes]; - bf = bloom_filter_builder::initialize_by_size(bytes, serialized_size_bytes, num_bits, num_hashes, seed); + bf = bloom_filter::builder::initialize_by_size(bytes, serialized_size_bytes, num_bits, num_hashes, seed); REQUIRE(bf.get_capacity() == adjusted_num_bits); REQUIRE(bf.get_num_hashes() == num_hashes); REQUIRE(bf.get_seed() == seed); 
REQUIRE(bf.is_empty()); - bf = bloom_filter_builder::initialize_by_accuracy(bytes, serialized_size_bytes, num_items, fpp, seed); + bf = bloom_filter::builder::initialize_by_accuracy(bytes, serialized_size_bytes, num_items, fpp, seed); REQUIRE(bf.get_capacity() == adjusted_num_bits); REQUIRE(bf.get_num_hashes() == num_hashes); REQUIRE(bf.get_seed() == seed); @@ -80,7 +80,7 @@ TEST_CASE("bloom_filter: basic operations", "[bloom_filter]") { uint64_t num_items = 5000; double fpp = 0.01; - auto bf = bloom_filter_builder::create_by_accuracy(num_items, fpp); + auto bf = bloom_filter::builder::create_by_accuracy(num_items, fpp); REQUIRE(bf.is_empty()); REQUIRE(bf.get_bits_used() == 0); @@ -105,7 +105,7 @@ TEST_CASE("bloom_filter: basic operations", "[bloom_filter]") { // initialize in memory and run the same tests // also checking against the results from the first part uint8_t* bf_memory = new uint8_t[bytes.size()]; - auto bf2 = bloom_filter_builder::initialize_by_accuracy(bf_memory, bytes.size(), num_items, fpp, bf.get_seed()); + auto bf2 = bloom_filter::builder::initialize_by_accuracy(bf_memory, bytes.size(), num_items, fpp, bf.get_seed()); REQUIRE(bf2.is_empty()); REQUIRE(bf2.get_bits_used() == 0); @@ -153,7 +153,7 @@ TEST_CASE("bloom_filter: inversion", "[bloom_filter]") { uint64_t num_bits = 8192; uint16_t num_hashes = 3; - auto bf = bloom_filter_builder::create_by_size(num_bits, num_hashes); + auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes); uint64_t n = 500; for (uint64_t i = 0; i < n; ++i) { @@ -186,18 +186,18 @@ TEST_CASE("bloom_filter: incompatible set operations", "[bloom_filter]") { uint64_t num_bits = 32768; uint16_t num_hashes = 4; - auto bf1 = bloom_filter_builder::create_by_size(num_bits, num_hashes); + auto bf1 = bloom_filter::builder::create_by_size(num_bits, num_hashes); // mismatched num bits - auto bf2 = bloom_filter_builder::create_by_size(2 * num_bits, num_hashes); + auto bf2 = bloom_filter::builder::create_by_size(2 * 
num_bits, num_hashes); REQUIRE_THROWS_AS(bf1.union_with(bf2), std::invalid_argument); // mismatched num hashes - auto bf3 = bloom_filter_builder::create_by_size(num_bits, 2 * num_hashes); + auto bf3 = bloom_filter::builder::create_by_size(num_bits, 2 * num_hashes); REQUIRE_THROWS_AS(bf1.intersect(bf2), std::invalid_argument); // mismatched seed - auto bf4 = bloom_filter_builder::create_by_size(num_bits, num_hashes, bf1.get_seed() + 1); + auto bf4 = bloom_filter::builder::create_by_size(num_bits, num_hashes, bf1.get_seed() + 1); REQUIRE_THROWS_AS(bf1.union_with(bf4), std::invalid_argument); } @@ -205,8 +205,8 @@ TEST_CASE("bloom_filter: basic union", "[bloom_filter]") { const uint64_t num_bits = 12288; const uint16_t num_hashes = 4; - auto bf1 = bloom_filter_builder::create_by_size(num_bits, num_hashes); - auto bf2 = bloom_filter_builder::create_by_size(num_bits, num_hashes, bf1.get_seed()); + auto bf1 = bloom_filter::builder::create_by_size(num_bits, num_hashes); + auto bf2 = bloom_filter::builder::create_by_size(num_bits, num_hashes, bf1.get_seed()); const uint64_t n = 1000; const uint32_t max_item = 3 * n / 2 - 1; @@ -233,8 +233,8 @@ TEST_CASE("bloom_filter: basic intersection", "[bloom_filter]") { const uint64_t num_bits = 8192; const uint16_t num_hahes = 5; - auto bf1 = bloom_filter_builder::create_by_size(num_bits, num_hahes); - auto bf2 = bloom_filter_builder::create_by_size(num_bits, num_hahes, bf1.get_seed()); + auto bf1 = bloom_filter::builder::create_by_size(num_bits, num_hahes); + auto bf2 = bloom_filter::builder::create_by_size(num_bits, num_hahes, bf1.get_seed()); const uint64_t n = 1024; const uint32_t max_item = 3 * n / 2 - 1; @@ -268,7 +268,7 @@ TEST_CASE("bloom_filter: empty serialization", "[bloom_filter]") { const uint64_t num_bits = 32769; const uint16_t num_hashes = 7; - auto bf = bloom_filter_builder::create_by_size(num_bits, num_hashes); + auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes); auto bytes = bf.serialize(); 
REQUIRE(bytes.size() == bf.get_serialized_size_bytes()); @@ -301,7 +301,7 @@ TEST_CASE("bloom_filter: non-empty serialization", "[bloom_filter]") { const uint64_t num_bits = 32768; const uint16_t num_hashes = 5; - auto bf = bloom_filter_builder::create_by_size(num_bits, num_hashes); + auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes); const uint64_t n = 1000; for (uint64_t i = 0; i < n; ++i) { bf.update(0.5 + i); // testing floats From 822bd53ae424b846046106012e1025e429fdbc46 Mon Sep 17 00:00:00 2001 From: Jon Malkin <786705+jmalkin@users.noreply.github.com> Date: Fri, 16 Aug 2024 21:40:53 -0700 Subject: [PATCH 29/30] Remove unused and unnecessary forward declaration --- filters/include/bloom_filter.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/filters/include/bloom_filter.hpp b/filters/include/bloom_filter.hpp index b6b4b960..d4e07416 100644 --- a/filters/include/bloom_filter.hpp +++ b/filters/include/bloom_filter.hpp @@ -31,7 +31,6 @@ namespace datasketches { // forward declarations template class bloom_filter_alloc; -template class bloom_filter_builder_alloc; // aliases with default allocator using bloom_filter = bloom_filter_alloc>; From 37f65316192e2af0a53824a6ab0037257c0b8830 Mon Sep 17 00:00:00 2001 From: Jon Malkin <786705+jmalkin@users.noreply.github.com> Date: Fri, 16 Aug 2024 21:58:58 -0700 Subject: [PATCH 30/30] no need for friend class with builder inside the filter --- filters/include/bloom_filter.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/filters/include/bloom_filter.hpp b/filters/include/bloom_filter.hpp index d4e07416..95420684 100644 --- a/filters/include/bloom_filter.hpp +++ b/filters/include/bloom_filter.hpp @@ -625,8 +625,6 @@ class bloom_filter_alloc { uint64_t num_bits_set_; uint8_t* bit_array_; // data backing bit_array_, regardless of ownership uint8_t* memory_; // if wrapped, pointer to the start of the filter, otheriwse nullptr - - friend class bloom_filter_builder_alloc; }; /**