Skip to content

Commit

Permalink
More work on plumbing attribute accumulation through (#263)
Browse files Browse the repository at this point in the history
* Plumb bounding boxes through potential intersections

* Quick bbox reject for bins that can't possibly intersect

* Inching toward attribute accumulation in megatile handling

* Some sort of test for how all these things interact with each other.

Automatic numeric attribute accumulation does *not* apply to attributes
that have an explicit attribute accumulator set, because the order of
operations is too messy and weird

* More sketching

* More sketching

* Actually do some accumulation

* Put all that behind an --accumulate-numeric flag

* Use the same attribute accumulation logic in binning as in megatiles

* Fix backwards conditional

* Add means, but somehow I have some counts of 0

* Handle aggregated attributes with no base attribute in the feature

* Checkpoint before I break everything

* Found a flaw, now to debug

* Fix a typo that broke accumulation

* Add binning tests

* Make sure IDs make it through on the bins

* Fix count/mean accumulation

* Make the numeric accumulation prefix configurable

* Make sure the accumulate test still works with a different prefix

* Forgot to update this test

* More testing to make sure cluster sizes make it all the way through

* Fix neglected --accumulate-attribute when binning

* Mark unexercised attribute accumulation cases as "can't happen"

* Factor out numeric preservation

* Attrs with the accumulation prefix are just preserved, not accumulated

* Test behavior of prefixed attributes

* Plumbing for exclude and exclude-prefix

* Implement and test attribute prefix stripping in overzoom

* Update version and changelog

* For debugging, make an attribute list of source feature IDs

* Revert "For debugging, make an attribute list of source feature IDs"

This reverts commit 65fc99c.
  • Loading branch information
e-n-f authored Sep 20, 2024
1 parent 5342428 commit c5f2f0d
Show file tree
Hide file tree
Showing 23 changed files with 2,387 additions and 3,920 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
# 2.62.3

* Summary statistics with --accumulate-numeric-attributes make it from tiling through to binning
* Prefix can be specified for --accumulate-numeric-attributes
* Added --exclude and --exclude-prefix to tippecanoe-overzoom

# 2.62.2

* Pass feature ID through with bins
Expand Down
102 changes: 98 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ indent:
TESTS = $(wildcard tests/*/out/*.json)
SPACE = $(NULL) $(NULL)

test: tippecanoe tippecanoe-decode $(addsuffix .check,$(TESTS)) raw-tiles-test parallel-test pbf-test join-test enumerate-test decode-test join-filter-test unit json-tool-test allow-existing-test csv-test layer-json-test pmtiles-test decode-pmtiles-test overzoom-test
test: tippecanoe tippecanoe-decode $(addsuffix .check,$(TESTS)) raw-tiles-test parallel-test pbf-test join-test enumerate-test decode-test join-filter-test unit json-tool-test allow-existing-test csv-test layer-json-test pmtiles-test decode-pmtiles-test overzoom-test accumulate-test
./unit

suffixes = json json.gz
Expand Down Expand Up @@ -367,15 +367,15 @@ overzoom-test: tippecanoe-overzoom
cmp tests/pbf/bin-11-327-791.pbf.out.json.check tests/pbf/bin-11-327-791.pbf.out.json
rm tests/pbf/bin-11-327-791.pbf.out.json.check tests/pbf/bin-11-327-791.pbf.out
# Binning with longitude wraparound problems
./tippecanoe-overzoom -o tests/pbf/0-0-0-pop-2-0-1.pbf.out --assign-to-bins tests/pbf/h3-2-0-1.geojson tests/pbf/0-0-0.pbf 2/0/1 2/0/1
./tippecanoe-overzoom -o tests/pbf/0-0-0-pop-2-0-1.pbf.out --accumulate-numeric-attributes=tippecanoe --assign-to-bins tests/pbf/h3-2-0-1.geojson tests/pbf/0-0-0.pbf 2/0/1 2/0/1
./tippecanoe-decode tests/pbf/0-0-0-pop-2-0-1.pbf.out 2 0 1 > tests/pbf/0-0-0-pop-2-0-1.pbf.out.json.check
cmp tests/pbf/0-0-0-pop-2-0-1.pbf.out.json.check tests/pbf/0-0-0-pop-2-0-1.pbf.out.json
rm tests/pbf/0-0-0-pop-2-0-1.pbf.out tests/pbf/0-0-0-pop-2-0-1.pbf.out.json.check
./tippecanoe-overzoom -o tests/pbf/0-0-0-pop-1-1-0.pbf.out --assign-to-bins tests/pbf/h3-1-1-0.geojson tests/pbf/0-0-0.pbf 1/1/0 1/1/0
./tippecanoe-overzoom -o tests/pbf/0-0-0-pop-1-1-0.pbf.out --accumulate-numeric-attributes=tippecanoe --assign-to-bins tests/pbf/h3-1-1-0.geojson tests/pbf/0-0-0.pbf 1/1/0 1/1/0
./tippecanoe-decode tests/pbf/0-0-0-pop-1-1-0.pbf.out 1 1 0 > tests/pbf/0-0-0-pop-1-1-0.pbf.out.json.check
cmp tests/pbf/0-0-0-pop-1-1-0.pbf.out.json.check tests/pbf/0-0-0-pop-1-1-0.pbf.out.json
rm tests/pbf/0-0-0-pop-1-1-0.pbf.out tests/pbf/0-0-0-pop-1-1-0.pbf.out.json.check
./tippecanoe-overzoom -o tests/pbf/0-0-0-pop-0-0-0.pbf.out --assign-to-bins tests/pbf/h3-0-0-0.geojson tests/pbf/0-0-0.pbf 0/0/0 0/0/0
./tippecanoe-overzoom -o tests/pbf/0-0-0-pop-0-0-0.pbf.out --accumulate-numeric-attributes=tippecanoe --assign-to-bins tests/pbf/h3-0-0-0.geojson tests/pbf/0-0-0.pbf 0/0/0 0/0/0
./tippecanoe-decode tests/pbf/0-0-0-pop-0-0-0.pbf.out 0 0 0 > tests/pbf/0-0-0-pop-0-0-0.pbf.out.json.check
cmp tests/pbf/0-0-0-pop-0-0-0.pbf.out.json.check tests/pbf/0-0-0-pop-0-0-0.pbf.out.json
rm tests/pbf/0-0-0-pop-0-0-0.pbf.out tests/pbf/0-0-0-pop-0-0-0.pbf.out.json.check
Expand Down Expand Up @@ -515,6 +515,100 @@ join-test: tippecanoe tippecanoe-decode tile-join
cmp tests/ne_110m_ocean/join/joined.mbtiles.json.check tests/ne_110m_ocean/join/joined.mbtiles.json
rm -f tests/ne_110m_ocean/join/ocean.mbtiles tests/ne_110m_ocean/join/countries.mbtiles tests/ne_110m_ocean/join/joined.mbtiles tests/ne_110m_ocean/join/joined.mbtiles.json.check

accumulate-test:
# there are 144 features with POP1950 in the original dataset
test `grep '"POP1950": [0-9]' tests/ne_110m_populated_places_nulls/in.json | wc -l` == 144
# and 99 without it
test `grep '"POP1950": null' tests/ne_110m_populated_places_nulls/in.json | wc -l` == 99
./tippecanoe -yNAME -yPOP1950 -yclustered:cluster_size -yclustered:unrelated -q -z3 -r1.75 -b0 -f -e tests/pbf/accum.dir --accumulate-numeric-attributes=clustered --set-attribute '{"clustered:cluster_size":1}' --accumulate-attribute '{"clustered:cluster_size":"sum"}' --retain-points-multiplier 3 tests/ne_110m_populated_places_nulls/in.json
# at this drop rate, there are 78 points at z0 that have no POP1950s clustered onto them....
test `./tippecanoe-decode -c tests/pbf/accum.dir/0/0/0.pbf 0 0 0 | grep -v 'clustered:count:POP1950' | wc -l` == 78
# 35 of which have no POP1950 at all
test `./tippecanoe-decode -c tests/pbf/accum.dir/0/0/0.pbf 0 0 0 | grep -v 'POP1950' | wc -l` == 35
# 43 of which do have POP1950
test `./tippecanoe-decode -c tests/pbf/accum.dir/0/0/0.pbf 0 0 0 | grep -v 'clustered:count:POP1950' | grep 'POP1950' | wc -l` == 43
# plus 59 that are clustered
test `./tippecanoe-decode -c tests/pbf/accum.dir/0/0/0.pbf 0 0 0 | grep 'clustered:count:POP1950' | wc -l` == 59
# the 59 clustered POP1950s have a total count of 101
test `./tippecanoe-decode -c tests/pbf/accum.dir/0/0/0.pbf 0 0 0 | grep 'clustered:count:POP1950' | sed 's/.*"clustered:count:POP1950": //' | awk '{sum += $$1} END {print sum}'` == 101
# we have already established that there are 43 bare POP1950s
# which makes a total of 144, which is the total count expected
#
# meanwhile, regular attribute accumulation.
# there are 137 features in the z0 tile, and they all have clustered:cluster_size
test `./tippecanoe-decode -c tests/pbf/accum.dir/0/0/0.pbf 0 0 0 | grep 'clustered:cluster_size' | wc -l` == 137
# there are no features that lack it.
test `./tippecanoe-decode -c tests/pbf/accum.dir/0/0/0.pbf 0 0 0 | grep -v 'clustered:cluster_size' | wc -l` == 0
# they add up to the 243 original features
test `./tippecanoe-decode -c tests/pbf/accum.dir/0/0/0.pbf 0 0 0 | sed 's/.*clustered:cluster_size": //' | awk '{sum += $$1} END {print sum}'` == 243
# Make sure we do *not* accumulate a numeric attribute that already has the magic prefix:
test `./tippecanoe-decode -c tests/pbf/accum.dir/0/0/0.pbf 0 0 0 | grep sum:clustered:unrelated | wc -l` == 0
# But that we *do* preserve those attributes into the output features:
test `./tippecanoe-decode -c tests/pbf/accum.dir/0/0/0.pbf 0 0 0 | grep clustered:unrelated | wc -l` == 66
#
# on to the sums:
# in the original data set, the POP1950s that are present add up to 161590
test `grep '"POP1950": [0-9]' tests/ne_110m_populated_places_nulls/in.json | sed 's/.*"POP1950": //' | awk '{sum += $$1} END {print sum}' ` == 161590
# in the z0 tile, the clustered POP1950s add up to 113357
test `./tippecanoe-decode -c tests/pbf/accum.dir/0/0/0.pbf 0 0 0 | grep 'clustered:sum:POP1950' | sed 's/.*"clustered:sum:POP1950": //' | awk '{sum += $$1} END {print sum}'` == 113357
# and the non-clustered ones add up to 48233
test `./tippecanoe-decode -c tests/pbf/accum.dir/0/0/0.pbf 0 0 0 | grep -v 'clustered:sum:POP1950' | grep POP1950 | sed 's/.*"POP1950": //' | awk '{sum += $$1} END {print sum}'` == 48233
# which is the correct 161590
#
# OK, so do these still hold after megatile filtering?
./tippecanoe-overzoom --accumulate-numeric-attributes=clustered --accumulate-attribute '{"clustered:cluster_size":"sum"}' -m -o tests/pbf/accum-0-0-0.pbf tests/pbf/accum.dir/0/0/0.pbf 0/0/0 0/0/0
# Now there are 40 features with POP1950 clusters
test `./tippecanoe-decode -c tests/pbf/accum-0-0-0.pbf 0 0 0 | grep 'clustered:count:POP1950' | wc -l` == 40
# There are 4 with bare POP1950
test `./tippecanoe-decode -c tests/pbf/accum-0-0-0.pbf 0 0 0 | grep -v 'clustered:count:POP1950' | grep 'POP1950' | wc -l` == 4
# And 2 with no POP1950 at all
test `./tippecanoe-decode -c tests/pbf/accum-0-0-0.pbf 0 0 0 | grep -v 'POP1950' | wc -l` == 2
# (which is the same as you get if you don't use --retain-points-multiplier when creating the tileset)
#
# the clustered and megatile-filtered POP1950s add up to 146370
test `./tippecanoe-decode -c tests/pbf/accum-0-0-0.pbf 0 0 0 | grep 'clustered:sum:POP1950' | sed 's/.*"clustered:sum:POP1950": //' | awk '{sum += $$1} END {print sum}'` == 146370
# the non-clustered but megatile-filtered POP1950s add up to 15220
test `./tippecanoe-decode -c tests/pbf/accum-0-0-0.pbf 0 0 0 | grep -v 'clustered:sum:POP1950' | grep POP1950 | sed 's/.*"POP1950": //' | awk '{sum += $$1} END {print sum}'` == 15220
# which add up to 161590 so we have the right global total
# Make sure we do *not* accumulate a numeric attribute that already has the magic prefix:
test `./tippecanoe-decode -c tests/pbf/accum-0-0-0.pbf 0 0 0 | grep sum:clustered:unrelated | wc -l` == 0
# But that we *do* preserve those attributes into the output features:
test `./tippecanoe-decode -c tests/pbf/accum-0-0-0.pbf 0 0 0 | grep clustered:unrelated | wc -l` == 22
# the cluster sizes still add up to the 243 original features
test `./tippecanoe-decode -c tests/pbf/accum-0-0-0.pbf 0 0 0 | sed 's/.*clustered:cluster_size": //' | awk '{sum += $$1} END {print sum}'` == 243
#
# We actually want to serve point tiles without the numeric accumulations,
# but with cluster size, so test that combination:
./tippecanoe-overzoom --accumulate-attribute '{"clustered:cluster_size":"sum"}' --exclude-prefix clustered:sum --exclude-prefix clustered:count --exclude-prefix clustered:min --exclude-prefix clustered:max --exclude-prefix clustered:mean -m -o tests/pbf/accum-0-0-0.pbf tests/pbf/accum.dir/0/0/0.pbf 0/0/0 0/0/0
# There are no POP1950 clusters
test `./tippecanoe-decode -c tests/pbf/accum-0-0-0.pbf 0 0 0 | grep 'clustered:count:POP1950' | wc -l` == 0
# But there are still 28 with bare POP1950
test `./tippecanoe-decode -c tests/pbf/accum-0-0-0.pbf 0 0 0 | grep -v 'clustered:count:POP1950' | grep 'POP1950' | wc -l` == 28
# And 18 with no POP1950 at all
test `./tippecanoe-decode -c tests/pbf/accum-0-0-0.pbf 0 0 0 | grep -v 'POP1950' | wc -l` == 18
# which matches the 46 features that you get if you tile without --retain-points-multiplier.
# the cluster sizes still add up to the 243 original features
test `./tippecanoe-decode -c tests/pbf/accum-0-0-0.pbf 0 0 0 | sed 's/.*clustered:cluster_size": //' | awk '{sum += $$1} END {print sum}'` == 243
#
# Now on to binning!
./tippecanoe-overzoom --assign-to-bins tests/pbf/h3-0-0-0.geojson --accumulate-numeric-attributes=clustered --accumulate-attribute '{"clustered:cluster_size":"sum"}' -o tests/pbf/bins-0-0-0.pbf tests/pbf/accum.dir/0/0/0.pbf 0/0/0 0/0/0
# Now there are 44 bins with POP1950 clusters
test `./tippecanoe-decode -c tests/pbf/bins-0-0-0.pbf 0 0 0 | grep 'clustered:count:POP1950' | wc -l` == 44
# There are none with bare POP1950 (which is expected; we should only have summary statistics)
test `./tippecanoe-decode -c tests/pbf/bins-0-0-0.pbf 0 0 0 | grep -v 'clustered:count:POP1950' | grep 'POP1950' | wc -l` == 0
# And 4 with no POP1950 at all
test `./tippecanoe-decode -c tests/pbf/bins-0-0-0.pbf 0 0 0 | grep -v 'POP1950' | wc -l` == 4
#
# the clustered and megatile-filtered and binned POP1950s add up to 161590
test `./tippecanoe-decode -c tests/pbf/bins-0-0-0.pbf 0 0 0 | grep 'clustered:sum:POP1950' | sed 's/.*"clustered:sum:POP1950": //' | awk '{sum += $$1} END {print sum}'` == 161590
# which is the right global total
# Make sure we do *not* accumulate a numeric attribute that already has the magic prefix:
test `./tippecanoe-decode -c tests/pbf/bins-0-0-0.pbf 0 0 0 | grep sum:clustered:unrelated | wc -l` == 0
# And those attributes do *not* make it onto the bins
test `./tippecanoe-decode -c tests/pbf/bins-0-0-0.pbf 0 0 0 | grep clustered:unrelated | wc -l` == 0
# the cluster sizes still add up to the 243 original features
test `./tippecanoe-decode -c tests/pbf/bins-0-0-0.pbf 0 0 0 | sed 's/.*clustered:cluster_size": //' | awk '{sum += $$1} END {print sum}'` == 243

join-filter-test: tippecanoe tippecanoe-decode tile-join
# Comes out different from the direct tippecanoe run because null attributes are lost
./tippecanoe -q -z0 -f -o tests/feature-filter/out/all.mbtiles tests/feature-filter/in.json
Expand Down
60 changes: 52 additions & 8 deletions attribute.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,13 @@
#include "jsonpull/jsonpull.h"
#include "milo/dtoa_milo.h"

// Lookup table from the textual name of a numeric accumulation operation
// ("sum", "min", "max", "count") to the corresponding attribute_op enumerator.
// NOTE(review): there is no "mean" entry here; presumably a mean is derived
// elsewhere from the accumulated sum and count -- confirm against the callers.
std::map<std::string, attribute_op> numeric_operations = {
{"sum", op_sum},
{"min", op_min},
{"max", op_max},
{"count", op_count},
};

void set_attribute_accum(std::unordered_map<std::string, attribute_op> &attribute_accum, std::string name, std::string type) {
attribute_op t;

Expand Down Expand Up @@ -88,12 +95,12 @@ void preserve_attribute(attribute_op const &op, std::string const &key, serial_v
case op_sum:
full_values[i].s = milo::dtoa_milo(atof(full_values[i].s.c_str()) + atof(val.s.c_str()));
full_values[i].type = mvt_double;
break;
return;

case op_product:
full_values[i].s = milo::dtoa_milo(atof(full_values[i].s.c_str()) * atof(val.s.c_str()));
full_values[i].type = mvt_double;
break;
return;

case op_max: {
double existing = atof(full_values[i].s.c_str());
Expand All @@ -102,7 +109,7 @@ void preserve_attribute(attribute_op const &op, std::string const &key, serial_v
full_values[i].s = val.s.c_str();
full_values[i].type = mvt_double;
}
break;
return;
}

case op_min: {
Expand All @@ -112,7 +119,7 @@ void preserve_attribute(attribute_op const &op, std::string const &key, serial_v
full_values[i].s = val.s.c_str();
full_values[i].type = mvt_double;
}
break;
return;
}

case op_mean: {
Expand All @@ -124,24 +131,26 @@ void preserve_attribute(attribute_op const &op, std::string const &key, serial_v
attribute_accum_state.insert(std::pair<std::string, accum_state>(key, s));

full_values[i].s = milo::dtoa_milo(s.sum / s.count);
full_values[i].type = mvt_double;
} else {
state->second.sum += atof(val.s.c_str());
state->second.count += 1;

full_values[i].s = milo::dtoa_milo(state->second.sum / state->second.count);
full_values[i].type = mvt_double;
}
break;
return;
}

case op_concat:
full_values[i].s += val.s;
full_values[i].type = mvt_string;
break;
return;

case op_comma:
full_values[i].s += std::string(",") + val.s;
full_values[i].type = mvt_string;
break;
return;

case op_count: {
auto state = attribute_accum_state.find(key);
Expand All @@ -155,9 +164,44 @@ void preserve_attribute(attribute_op const &op, std::string const &key, serial_v
state->second.count += 1;
full_values[i].s = std::to_string(state->second.count);
}
break;
return;
}
}
}
}

// not found, so we are making a new value

serial_val sv;
switch (op) {
case op_sum:
case op_max:
case op_min:
sv.s = val.s;
sv.type = mvt_double;
break;

case op_count: {
auto state = attribute_accum_state.find(key);
if (state == attribute_accum_state.end()) { // not already present
accum_state s;
s.count = 1;
attribute_accum_state.insert(std::pair<std::string, accum_state>(key, s));

sv.s = std::to_string(s.count);
} else { // already present, incrementing
fprintf(stderr, "preserve_attribute: can't happen (count)\n");
exit(EXIT_IMPOSSIBLE);
}
sv.type = mvt_double;
break;
}

default:
fprintf(stderr, "can't happen: operation that isn't used by --accumulate-numeric-attributes\n");
exit(EXIT_IMPOSSIBLE);
}

full_keys.push_back(key);
full_values.push_back(sv);
}
3 changes: 3 additions & 0 deletions attribute.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include <vector>
#include <unordered_map>
#include <map>

enum attribute_op {
op_sum,
Expand All @@ -26,4 +27,6 @@ void set_attribute_accum(std::unordered_map<std::string, attribute_op> &attribut
void set_attribute_accum(std::unordered_map<std::string, attribute_op> &attribute_accum, const char *arg, char **argv);
void preserve_attribute(attribute_op const &op, const std::string &key, serial_val const &val, std::vector<std::string> &full_keys, std::vector<serial_val> &full_values, std::unordered_map<std::string, accum_state> &attribute_accum_state);

extern std::map<std::string, attribute_op> numeric_operations;

#endif
Loading

0 comments on commit c5f2f0d

Please sign in to comment.