Skip to content

Commit

Permalink
implemented get_quantile(rank)
Browse files Browse the repository at this point in the history
  • Loading branch information
AlexanderSaydakov committed Jan 12, 2024
1 parent 1a9667b commit 1540be6
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 1 deletion.
2 changes: 2 additions & 0 deletions tdigest/include/tdigest.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,8 @@ class tdigest {
void merge_new_values();
void merge_new_values(bool force, uint16_t k);
void merge_new_values(uint16_t k);

static double weighted_average(double x1, double w1, double x2, double w2);
};

} /* namespace datasketches */
Expand Down
50 changes: 49 additions & 1 deletion tdigest/include/tdigest_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,50 @@ double tdigest<T, A>::get_rank(T value) const {
template<typename T, typename A>
T tdigest<T, A>::get_quantile(double rank) const {
if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch");
return 0;
if ((rank < 0.0) || (rank > 1.0)) {
throw std::invalid_argument("Normalized rank cannot be less than 0 or greater than 1");
}
const_cast<tdigest*>(this)->merge_new_values(); // side effect
if (centroids_.size() == 1) return centroids_.front().get_mean();

// at least 2 centroids
const double weight = rank * total_weight_;
if (weight < 1) return min_;
if (weight > total_weight_ - 1.0) return max_;
const double first_weight = centroids_.front().get_weight();
if (first_weight > 1 && weight < first_weight / 2.0) {
return min_ + (weight - 1.0) / (first_weight / 2.0 - 1.0) * (centroids_.front().get_mean() - min_);
}
const double last_weight = centroids_.back().get_weight();
if (last_weight > 1 && total_weight_ - weight <= last_weight / 2.0) {
return max_ + (total_weight_ - weight - 1.0) / (last_weight / 2.0 - 1.0) * (max_ - centroids_.back().get_mean());
}

// interpolate between extremes
double weight_so_far = first_weight / 2.0;
for (size_t i = 0; i < centroids_.size() - 1; ++i) {
const double dw = (centroids_[i].get_weight() + centroids_[i + 1].get_weight()) / 2.0;
if (weight_so_far + dw > weight) {
// the target weight is between centroids i and i+1
double left_weight = 0;
if (centroids_[i].get_weight() == 1) {
if (weight - weight_so_far < 0.5) return centroids_[i].get_mean();
left_weight = 0.5;
}
double right_weight = 0;
if (centroids_[i + 1].get_weight() == 1) {
if (weight_so_far + dw - weight <= 0.5) return centroids_[i + 1].get_mean();
right_weight = 0.5;
}
const double w1 = weight - weight_so_far - left_weight;
const double w2 = weight_so_far + dw - weight - right_weight;
return weighted_average(centroids_[i].get_mean(), w1, centroids_[i + 1].get_mean(), w2);
}
weight_so_far += dw;
}
const double w1 = weight - total_weight_ - centroids_.back().get_weight() / 2.0;
const double w2 = centroids_.back().get_weight() / 2.0 - w1;
return weighted_average(centroids_.back().get_weight(), w1, max_, w2);
}

template<typename T, typename A>
Expand Down Expand Up @@ -272,6 +315,11 @@ void tdigest<T, A>::merge_new_values(uint16_t k) {
buffered_weight_ = 0;
}

template<typename T, typename A>
double tdigest<T, A>::weighted_average(double x1, double w1, double x2, double w2) {
return (x1 * w1 + x2 * w2) / (w1 + w2);
}

} /* namespace datasketches */

#endif // _TDIGEST_IMPL_HPP_
8 changes: 8 additions & 0 deletions tdigest/test/tdigest_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ TEST_CASE("one value", "[tdigest]") {
REQUIRE(td.get_rank(0.99) == 0);
REQUIRE(td.get_rank(1) == 0.5);
REQUIRE(td.get_rank(1.01) == 1);
REQUIRE(td.get_quantile(0) == 1);
REQUIRE(td.get_quantile(0.5) == 1);
REQUIRE(td.get_quantile(1) == 1);
}

TEST_CASE("many values", "[tdigest]") {
Expand All @@ -63,6 +66,11 @@ TEST_CASE("many values", "[tdigest]") {
REQUIRE(td.get_rank(n / 2) == Approx(0.5).margin(0.0001));
REQUIRE(td.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
REQUIRE(td.get_rank(n) == 1);
REQUIRE(td.get_quantile(0) == 0);
REQUIRE(td.get_quantile(0.5) == Approx(n / 2).epsilon(0.01));
REQUIRE(td.get_quantile(0.9) == Approx(n * 0.9).epsilon(0.01));
REQUIRE(td.get_quantile(0.95) == Approx(n * 0.95).epsilon(0.01));
REQUIRE(td.get_quantile(1) == n - 1);
}

TEST_CASE("rank - two values", "[tdigest]") {
Expand Down

0 comments on commit 1540be6

Please sign in to comment.