From af8e2b9eb70e88983b430d78ea4209691dd77731 Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Fri, 10 May 2024 15:28:35 -0700 Subject: [PATCH] implemented filter --- tuple/include/tuple_sketch.hpp | 29 +++++++++++++- tuple/include/tuple_sketch_impl.hpp | 33 ++++++++++++++++ tuple/test/tuple_sketch_test.cpp | 61 +++++++++++++++++++++++++++++ 3 files changed, 122 insertions(+), 1 deletion(-) diff --git a/tuple/include/tuple_sketch.hpp b/tuple/include/tuple_sketch.hpp index 383e5736..cbfd9f11 100644 --- a/tuple/include/tuple_sketch.hpp +++ b/tuple/include/tuple_sketch.hpp @@ -381,6 +381,15 @@ class update_tuple_sketch: public tuple_sketch { */ compact_tuple_sketch compact(bool ordered = true) const; + /** + * Produces a Compact Tuple sketch from this sketch + * by applying a given predicate to each entry. + * @param predicate should return true for the entries to keep + * @return compact sketch with the entries retained according to the predicate + */ + template + compact_tuple_sketch filter(const Predicate& predicate) const; + virtual iterator begin(); virtual iterator end(); virtual const_iterator begin() const; @@ -480,6 +489,25 @@ class compact_tuple_sketch: public tuple_sketch { virtual uint32_t get_num_retained() const; virtual uint16_t get_seed_hash() const; + /** + * Produces a Compact Tuple sketch from this sketch + * by applying a given predicate to each entry. + * @param predicate should return true for the entries to keep + * @return compact sketch with the entries retained according to the predicate + */ + template + compact_tuple_sketch filter(const Predicate& predicate) const; + + /** + * Produces a Compact Tuple sketch from a given sketch (Update or Compact) + * by applying a given predicate to each entry. + * @param sketch input sketch + * @param predicate should return true for the entries to keep + * @return compact sketch with the entries retained according to the predicate + */ + template + static compact_tuple_sketch filter(const Sketch& sketch, const Predicate& predicate); + /** * This method serializes the sketch into a given stream in a binary form * @param os output stream @@ -579,7 +607,6 @@ class compact_tuple_sketch: public tuple_sketch { template friend class theta_intersection_base; template friend class theta_set_difference_base; compact_tuple_sketch(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector&& entries); - }; /// Tuple base builder diff --git a/tuple/include/tuple_sketch_impl.hpp b/tuple/include/tuple_sketch_impl.hpp index 0766e4dd..e5bf8340 100644 --- a/tuple/include/tuple_sketch_impl.hpp +++ b/tuple/include/tuple_sketch_impl.hpp @@ -258,6 +258,12 @@ compact_tuple_sketch update_tuple_sketch::compact(bool ordered return compact_tuple_sketch(*this, ordered); } +template +template +compact_tuple_sketch update_tuple_sketch::filter(const Predicate& predicate) const { + return compact_tuple_sketch::filter(*this, predicate); +} + template void update_tuple_sketch::print_specifics(std::ostringstream& os) const { os << " lg nominal size : " << (int) map_.lg_nom_size_ << std::endl; @@ -344,6 +350,33 @@ uint16_t compact_tuple_sketch::get_seed_hash() const { return seed_hash_; } +template +template +compact_tuple_sketch compact_tuple_sketch::filter(const Predicate& predicate) const { + return filter(*this, predicate); +} + +template +template +compact_tuple_sketch compact_tuple_sketch::filter(const Sketch& sketch, const Predicate& predicate) { + std::vector entries(sketch.get_allocator()); + entries.reserve(sketch.get_num_retained()); + std::copy_if( + sketch.begin(), + sketch.end(), + std::back_inserter(entries), + [&predicate](const Entry& e) {return predicate(e.second);} + ); + entries.shrink_to_fit(); + return compact_tuple_sketch( + !sketch.is_estimation_mode() && entries.empty(), + sketch.is_ordered(), + sketch.get_seed_hash(), + sketch.get_theta64(), + std::move(entries) + ); +} + // implementation for fixed-size arithmetic types (integral and floating point) template template::value, int>::type> diff --git a/tuple/test/tuple_sketch_test.cpp b/tuple/test/tuple_sketch_test.cpp index b5316e80..6e44e284 100644 --- a/tuple/test/tuple_sketch_test.cpp +++ b/tuple/test/tuple_sketch_test.cpp @@ -310,4 +310,65 @@ TEST_CASE("tuple sketch: float, update with different types of keys", "[tuple_sk REQUIRE(sketch.get_num_retained() == 3); } +TEST_CASE("filter", "[tuple_sketch]") { + auto usk = update_tuple_sketch::builder().build(); + + { // empty update sketch + auto sk = usk.filter([](int){return true;}); + REQUIRE(sk.is_empty()); + REQUIRE(sk.is_ordered()); + REQUIRE(sk.get_num_retained() == 0); + } + + { // empty compact sketch + auto sk = usk.compact().filter([](int){return true;}); + REQUIRE(sk.is_empty()); + REQUIRE(sk.is_ordered()); + REQUIRE(sk.get_num_retained() == 0); + } + + usk.update(1, 1); + usk.update(1, 1); + usk.update(2, 1); + usk.update(2, 1); + usk.update(3, 1); + + { // exact mode update sketch + auto sk = usk.filter([](int v){return v > 1;}); + REQUIRE_FALSE(sk.is_empty()); + REQUIRE_FALSE(sk.is_ordered()); + REQUIRE_FALSE(sk.is_estimation_mode()); + REQUIRE(sk.get_num_retained() == 2); + } + + { // exact mode compact sketch + auto sk = usk.compact().filter([](int v){return v > 1;}); + REQUIRE_FALSE(sk.is_empty()); + REQUIRE(sk.is_ordered()); + REQUIRE_FALSE(sk.is_estimation_mode()); + REQUIRE(sk.get_num_retained() == 2); + } + + // only keys 1 and 2 had values of 2, which will become 3 after this update + // some entries are discarded in estimation mode, but these happen to survive + // the process is deterministic, so the test will always work + for (int i = 0; i < 10000; ++i) usk.update(i, 1); + + { // estimation mode update sketch + auto sk = usk.filter([](int v){return v > 2;}); + REQUIRE_FALSE(sk.is_empty()); + REQUIRE_FALSE(sk.is_ordered()); + REQUIRE(sk.is_estimation_mode()); + REQUIRE(sk.get_num_retained() == 2); + } + + { // estimation mode compact sketch + auto sk = usk.compact().filter([](int v){return v > 2;}); + REQUIRE_FALSE(sk.is_empty()); + REQUIRE(sk.is_ordered()); + REQUIRE(sk.is_estimation_mode()); + REQUIRE(sk.get_num_retained() == 2); + } +} + } /* namespace datasketches */