From 69c68095b214381277623113252ec845560ee0aa Mon Sep 17 00:00:00 2001
From: Moriarty <22225248+apmoriarty@users.noreply.github.com>
Date: Tue, 31 Oct 2023 11:13:32 -0400
Subject: [PATCH 01/32] TLDEquality refactored to perform an in-line byte
comparison (#2151)
* TLDEquality refactored to perform an in-line byte comparison instead of delegating to expensive TLD utility methods
* Check for edge cases
---
.../datawave/query/function/TLDEquality.java | 22 +++--
.../query/function/TLDEqualityTest.java | 87 ++++++++++---------
2 files changed, 63 insertions(+), 46 deletions(-)
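[Reviewer note, not part of the patch] The new partOf() treats two document IDs as equal when they agree through the first three dot-delimited segments (the root pointer), or when one ID is a prefix of the other; empty column families never match. A minimal standalone sketch of that comparison, using plain Strings instead of the datatype-prefixed column family bytes the patch operates on (the helper name is illustrative):

    // Mirrors the in-line byte comparison now performed by TLDEquality.partOf
    static boolean sameRootPointer(String uid, String other) {
        int dots = 0;
        int len = Math.min(uid.length(), other.length());
        for (int i = 0; i < len; i++) {
            char a = uid.charAt(i);
            if (a != other.charAt(i)) {
                return false;                        // diverged before the root pointer ended
            } else if (a == '.' && ++dots == 3) {
                return true;                         // agreed through the full root pointer
            }
        }
        return len != 0;                             // one id is a prefix of the other; empty ids never match
    }

For example, sameRootPointer("-7m7uk9.oz9qpy.-nfahrv.123.987", "-7m7uk9.oz9qpy.-nfahrv.123.777") returns true, while IDs with different roots return false.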
diff --git a/warehouse/query-core/src/main/java/datawave/query/function/TLDEquality.java b/warehouse/query-core/src/main/java/datawave/query/function/TLDEquality.java
index da2ec14fa6..56d32227f9 100644
--- a/warehouse/query-core/src/main/java/datawave/query/function/TLDEquality.java
+++ b/warehouse/query-core/src/main/java/datawave/query/function/TLDEquality.java
@@ -3,11 +3,9 @@
import org.apache.accumulo.core.data.ByteSequence;
import org.apache.accumulo.core.data.Key;
-import datawave.query.tld.TLD;
-
/**
* A key equality implementation that compares to the root pointers of two doc Ids together.
- *
+ *
* For example, two IDs `h1.h2.h3.a.b.c.d` and `h1.h2.h3.e.f` would be considered equal by this check.
*/
public class TLDEquality implements Equality {
@@ -23,8 +21,20 @@ public class TLDEquality implements Equality {
*/
@Override
public boolean partOf(Key key, Key other) {
- ByteSequence docCF = TLD.estimateRootPointerFromId(key.getColumnFamilyData());
- ByteSequence otherCF = TLD.estimateRootPointerFromId(other.getColumnFamilyData());
- return otherCF.equals(docCF);
+ ByteSequence keyCf = key.getColumnFamilyData();
+ ByteSequence otherCf = other.getColumnFamilyData();
+
+ int dotCount = 0;
+ int len = Math.min(keyCf.length(), otherCf.length());
+ for (int i = 0; i < len; i++) {
+ byte a = keyCf.byteAt(i);
+ byte b = otherCf.byteAt(i);
+ if (a != b) {
+ return false;
+ } else if (a == '.' && ++dotCount == 3) {
+ return true;
+ }
+ }
+ return len != 0;
}
}
diff --git a/warehouse/query-core/src/test/java/datawave/query/function/TLDEqualityTest.java b/warehouse/query-core/src/test/java/datawave/query/function/TLDEqualityTest.java
index daa5fc19c7..c170a37ca3 100644
--- a/warehouse/query-core/src/test/java/datawave/query/function/TLDEqualityTest.java
+++ b/warehouse/query-core/src/test/java/datawave/query/function/TLDEqualityTest.java
@@ -1,68 +1,75 @@
package datawave.query.function;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
import org.apache.accumulo.core.data.Key;
-import org.junit.Test;
+import org.junit.jupiter.api.Test;
-public class TLDEqualityTest {
+class TLDEqualityTest {
+ private final Key key = new Key("row", "datatype\0-7m7uk9.oz9qpy.-nfahrv");
+ private final Key keyChildOne = new Key("row", "datatype\0-7m7uk9.oz9qpy.-nfahrv.123.987");
+ private final Key keyChildTwo = new Key("row", "datatype\0-7m7uk9.oz9qpy.-nfahrv.123.777");
+ private final Key otherKey = new Key("row", "datatype\0-7m7uk9.oz9qpy.-aaazzz");
+ private final Key otherKeyChildOne = new Key("row", "datatype\0-7m7uk9.oz9qpy.-aaazzz.123.987");
- private TLDEquality equality = new TLDEquality();
+ private final TLDEquality equality = new TLDEquality();
@Test
- public void testSameParent() {
- Key docKey = new Key("row", "parent.document.id");
- Key otherKey = new Key("row", "parent.document.id");
- assertTrue(equality.partOf(docKey, otherKey));
- assertTrue(equality.partOf(otherKey, docKey));
+ void testSameParent() {
+ assertTrue(equality.partOf(key, key));
+ assertTrue(equality.partOf(otherKey, otherKey));
}
@Test
- public void testDifferentParents() {
- Key docKey = new Key("row", "parent.document.id");
- Key otherKey = new Key("row", "parent.document.id2");
- assertFalse(equality.partOf(docKey, otherKey));
- assertFalse(equality.partOf(otherKey, docKey));
+ void testDifferentParents() {
+ assertFalse(equality.partOf(key, otherKey));
+ assertFalse(equality.partOf(otherKey, key));
}
@Test
- public void testKeysOfDifferentDepths() {
- Key docKey = new Key("row", "parent.document.id");
- Key otherKey = new Key("row", "parent.document.id.child");
- assertFalse(equality.partOf(docKey, otherKey));
- assertFalse(equality.partOf(otherKey, docKey));
+ void testKeysOfDifferentDepths() {
+ assertTrue(equality.partOf(key, keyChildOne));
+ assertTrue(equality.partOf(keyChildOne, key));
+
+ assertTrue(equality.partOf(otherKey, otherKeyChildOne));
+ assertTrue(equality.partOf(otherKeyChildOne, otherKey));
+ }
+
+ @Test
+ void testSameParentSameChildren() {
+ assertTrue(equality.partOf(keyChildOne, keyChildOne));
+ assertTrue(equality.partOf(keyChildTwo, keyChildTwo));
+
+ assertTrue(equality.partOf(otherKeyChildOne, otherKeyChildOne));
}
@Test
- public void testSameParentSameChildren() {
- Key docKey = new Key("row", "parent.document.id.child");
- Key otherKey = new Key("row", "parent.document.id.child");
- assertTrue(equality.partOf(docKey, otherKey));
- assertTrue(equality.partOf(otherKey, docKey));
+ void testSameParentDifferentChildren() {
+ assertTrue(equality.partOf(keyChildOne, keyChildTwo));
+ assertTrue(equality.partOf(keyChildTwo, keyChildOne));
}
@Test
- public void testSameParentDifferentChildren() {
- Key docKey = new Key("row", "parent.document.id.child");
- Key otherKey = new Key("row", "parent.document.id.child2");
- assertFalse(equality.partOf(docKey, otherKey));
- assertFalse(equality.partOf(otherKey, docKey));
+ void testDifferentParentSameChildren() {
+ assertFalse(equality.partOf(keyChildOne, otherKeyChildOne));
+ assertFalse(equality.partOf(otherKeyChildOne, keyChildOne));
}
@Test
- public void testDifferentParentSameChildren() {
- Key docKey = new Key("row", "parent.document.id.child");
- Key otherKey = new Key("row", "parent.document.id2.child");
- assertFalse(equality.partOf(docKey, otherKey));
- assertFalse(equality.partOf(otherKey, docKey));
+ void testDifferentParentDifferentChildren() {
+ assertFalse(equality.partOf(keyChildTwo, otherKeyChildOne));
+ assertFalse(equality.partOf(otherKeyChildOne, keyChildTwo));
}
@Test
- public void testDifferentParentDifferentChildren() {
- Key docKey = new Key("row", "parent.document.id.child");
- Key otherKey = new Key("row", "parent.document.id2.child2");
- assertFalse(equality.partOf(docKey, otherKey));
- assertFalse(equality.partOf(otherKey, docKey));
+ void testEdgeCases() {
+ assertFalse(equality.partOf(key, new Key("", "")));
+ assertFalse(equality.partOf(new Key("", ""), key));
+
+ // in practice this should never happen
+ Key malformedUid = new Key("row", "datatype\0-7m7uk9.oz9qpy.-");
+ assertTrue(equality.partOf(key, malformedUid));
+ assertTrue(equality.partOf(malformedUid, key));
}
}
From cff78e0f1ce4a3180c77661cfe046b4bf627e0ea Mon Sep 17 00:00:00 2001
From: palindrome <31748527+hlgp@users.noreply.github.com>
Date: Wed, 1 Nov 2023 17:50:10 -0400
Subject: [PATCH 02/32] Adding more thorough edge unit tests ahead of
forthcoming refactor to guarantee functionality will remain unchanged (#2154)
Co-authored-by: hlgp
---
.../handler/edge/EdgeHandlerTestUtil.java | 13 +-
.../ProtobufEdgeDeletePreconditionTest.java | 6 +-
.../edge/ProtobufEdgePreconditionTest.java | 172 +++++++++++++++++-
3 files changed, 178 insertions(+), 13 deletions(-)
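[Reviewer note, not part of the patch] After this change the test util captures the full key structure and the decoded protobuf value for every emitted edge, keyed by the edge row with null bytes rendered as "%00;". A sketch of how the new tests below consume the multimaps (the "guppy%00;siamese" row and expected strings are taken from those tests):

    // edgeKeyResults: row -> list of String[4] {columnFamily, columnQualifier, visibility, timestamp}
    String[] firstKey = EdgeHandlerTestUtil.edgeKeyResults.get("guppy%00;siamese").get(0);
    Assert.assertEquals("MY_EDGE_TYPE/TO-FROM", firstKey[0]);    // column family
    Assert.assertEquals("PRIVATE", firstKey[2]);                 // visibility

    // edgeValueResults: row -> list of decoded EdgeValue.toString() outputs
    String firstValue = EdgeHandlerTestUtil.edgeValueResults.get("guppy%00;siamese").get(0);
    Assert.assertTrue(firstValue.contains("sourceValue: guppy"));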
diff --git a/warehouse/ingest-core/src/test/java/datawave/ingest/mapreduce/handler/edge/EdgeHandlerTestUtil.java b/warehouse/ingest-core/src/test/java/datawave/ingest/mapreduce/handler/edge/EdgeHandlerTestUtil.java
index 453dc9fd07..cb11e7362d 100644
--- a/warehouse/ingest-core/src/test/java/datawave/ingest/mapreduce/handler/edge/EdgeHandlerTestUtil.java
+++ b/warehouse/ingest-core/src/test/java/datawave/ingest/mapreduce/handler/edge/EdgeHandlerTestUtil.java
@@ -13,10 +13,13 @@
import org.apache.log4j.Logger;
import org.junit.Assert;
+import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.HashMultimap;
+import com.google.common.collect.ListMultimap;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
+import datawave.edge.util.EdgeValue;
import datawave.ingest.config.RawRecordContainerImpl;
import datawave.ingest.data.RawRecordContainer;
import datawave.ingest.data.config.NormalizedContentInterface;
@@ -32,7 +35,8 @@ public class EdgeHandlerTestUtil {
public static final Text edgeTableName = new Text(TableName.EDGE);
public static final String NB = "\u0000";
- public static Set<String> edgeKeyResults = new HashSet<>();
+ public static ListMultimap<String,String[]> edgeKeyResults = ArrayListMultimap.create();
+ public static ListMultimap<String,String> edgeValueResults = ArrayListMultimap.create();
private static Logger log = Logger.getLogger(EdgeHandlerTestUtil.class);
@@ -65,6 +69,7 @@ public static void processEvent(Multimap even
for (Map.Entry entry : contextWriter.getCache().entries()) {
if (entry.getKey().getTableName().equals(edgeTableName)) {
edgeKeys.add(entry.getKey().getKey());
+ edgeValueResults.put(entry.getKey().getKey().getRow().toString().replaceAll(NB, "%00;"), EdgeValue.decode(entry.getValue()).toString());
}
if (!entry.getKey().getTableName().equals(edgeTableName) || entry.getKey().getKey().isDeleted() == edgeDeleteMode) {
if (countMap.containsKey(entry.getKey().getTableName())) {
@@ -84,7 +89,11 @@ public static void processEvent(Multimap even
// check edge keys
for (Key k : edgeKeys) {
- edgeKeyResults.add(k.getRow().toString().replaceAll(NB, "%00;"));
+
+ String[] tempArr = {k.getColumnFamily().toString().replaceAll(NB, "%00;"), k.getColumnQualifier().toString().replaceAll(NB, "%00;"),
+ k.getColumnVisibility().toString(), String.valueOf(k.getTimestamp())};
+ edgeKeyResults.put(k.getRow().toString().replaceAll(NB, "%00;"), tempArr);
+
keyPrint.add("edge key: " + k.getRow().toString().replaceAll(NB, "%00;") + " ::: " + k.getColumnFamily().toString().replaceAll(NB, "%00;") + " ::: "
+ k.getColumnQualifier().toString().replaceAll(NB, "%00;") + " ::: " + k.getColumnVisibility() + " ::: " + k.getTimestamp()
+ " ::: " + k.isDeleted() + "\n");
diff --git a/warehouse/ingest-core/src/test/java/datawave/ingest/mapreduce/handler/edge/ProtobufEdgeDeletePreconditionTest.java b/warehouse/ingest-core/src/test/java/datawave/ingest/mapreduce/handler/edge/ProtobufEdgeDeletePreconditionTest.java
index 6a7918c210..4502b66896 100644
--- a/warehouse/ingest-core/src/test/java/datawave/ingest/mapreduce/handler/edge/ProtobufEdgeDeletePreconditionTest.java
+++ b/warehouse/ingest-core/src/test/java/datawave/ingest/mapreduce/handler/edge/ProtobufEdgeDeletePreconditionTest.java
@@ -102,7 +102,7 @@ public void testDeleteUnawarePreconSameGroup() {
RawRecordContainer myEvent = getEvent(conf);
EdgeHandlerTestUtil.processEvent(fields, edgeHandler, myEvent, 8, true, true);
- Assert.assertEquals(expectedKeys, EdgeHandlerTestUtil.edgeKeyResults);
+ Assert.assertEquals(expectedKeys, EdgeHandlerTestUtil.edgeKeyResults.keySet());
}
@@ -143,7 +143,7 @@ public void testDeleteUnawarePreconDifferentGroup() {
EdgeHandlerTestUtil.processEvent(fields, edgeHandler, myEvent, 12, true, true);
- Assert.assertEquals(expectedKeys, EdgeHandlerTestUtil.edgeKeyResults);
+ Assert.assertEquals(expectedKeys, EdgeHandlerTestUtil.edgeKeyResults.keySet());
}
@@ -188,7 +188,7 @@ public void testDeleteUnawarePreconAndedDifferentGroup() {
EdgeHandlerTestUtil.processEvent(fields, edgeHandler, myEvent, 12, true, true);
- Assert.assertEquals(expectedKeys, EdgeHandlerTestUtil.edgeKeyResults);
+ Assert.assertEquals(expectedKeys, EdgeHandlerTestUtil.edgeKeyResults.keySet());
}
diff --git a/warehouse/ingest-core/src/test/java/datawave/ingest/mapreduce/handler/edge/ProtobufEdgePreconditionTest.java b/warehouse/ingest-core/src/test/java/datawave/ingest/mapreduce/handler/edge/ProtobufEdgePreconditionTest.java
index d0d5ba7b0c..ee0075800d 100644
--- a/warehouse/ingest-core/src/test/java/datawave/ingest/mapreduce/handler/edge/ProtobufEdgePreconditionTest.java
+++ b/warehouse/ingest-core/src/test/java/datawave/ingest/mapreduce/handler/edge/ProtobufEdgePreconditionTest.java
@@ -5,6 +5,8 @@
import java.time.Instant;
import java.time.format.DateTimeFormatter;
+import java.util.Collections;
+import java.util.Date;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Set;
@@ -35,6 +37,7 @@
import datawave.ingest.mapreduce.SimpleDataTypeHandler;
import datawave.ingest.mapreduce.job.BulkIngestKey;
import datawave.ingest.time.Now;
+import datawave.util.time.DateHelper;
public class ProtobufEdgePreconditionTest {
@@ -44,6 +47,7 @@ public class ProtobufEdgePreconditionTest {
private static Type type = new Type("mycsv", FakeIngestHelper.class, null, new String[] {SimpleDataTypeHandler.class.getName()}, 10, null);
private static final Now now = Now.getInstance();
private Configuration conf;
+ private String loadDateStr = DateHelper.format(new Date(now.get()));
@Before
public void setup() {
@@ -62,6 +66,7 @@ public void setup() {
fields.clear();
EdgeHandlerTestUtil.edgeKeyResults.clear();
+ EdgeHandlerTestUtil.edgeValueResults.clear();
}
private RawRecordContainer getEvent(Configuration conf) {
@@ -70,6 +75,7 @@ private RawRecordContainer getEvent(Configuration conf) {
myEvent.addSecurityMarking("columnVisibility", "PRIVATE");
myEvent.setDataType(type);
myEvent.setId(UID.builder().newId());
+ myEvent.setAltIds(Collections.singleton("0016dd72-0000-827d-dd4d-001b2163ba09"));
myEvent.setConf(conf);
Instant i = Instant.from(DateTimeFormatter.ISO_INSTANT.parse("2022-10-26T01:31:53Z"));
@@ -107,7 +113,157 @@ public void testUnawarePreconSameGroup() {
RawRecordContainer myEvent = getEvent(conf);
EdgeHandlerTestUtil.processEvent(fields, edgeHandler, myEvent, 8, true, false);
- Assert.assertEquals(expectedKeys, EdgeHandlerTestUtil.edgeKeyResults);
+ Assert.assertEquals(expectedKeys, EdgeHandlerTestUtil.edgeKeyResults.keySet());
+
+ // colFam
+ Assert.assertEquals("MY_EDGE_TYPE/TO-FROM", EdgeHandlerTestUtil.edgeKeyResults.get("guppy%00;siamese").get(0)[0]);
+
+ // colQual
+ Assert.assertEquals("20221026/MY_CSV_DATA-MY_CSV_DATA///B", EdgeHandlerTestUtil.edgeKeyResults.get("guppy%00;siamese").get(0)[1]);
+
+ // values
+ Assert.assertEquals(1, EdgeHandlerTestUtil.edgeValueResults.get("guppy%00;siamese").size());
+ Assert.assertEquals(
+ "count: 1, bitmask: 2, sourceValue: guppy, sinkValue: siamese, hours: , duration: , loadDate: " + loadDateStr
+ + ", uuidString: , uuidObj: 0016dd72-0000-827d-dd4d-001b2163ba09, badActivityDate: ",
+ EdgeHandlerTestUtil.edgeValueResults.get("guppy%00;siamese").get(0));
+ Assert.assertEquals(1, EdgeHandlerTestUtil.edgeValueResults.get("guppy").size());
+ Assert.assertEquals(
+ "count: , bitmask: , sourceValue: guppy, sinkValue: , hours: [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], duration: , loadDate: "
+ + loadDateStr + ", uuidString: , uuidObj: 0016dd72-0000-827d-dd4d-001b2163ba09, badActivityDate: ",
+ EdgeHandlerTestUtil.edgeValueResults.get("guppy").get(0));
+
+ // vis and ts
+ Assert.assertEquals("PRIVATE", EdgeHandlerTestUtil.edgeKeyResults.get("guppy%00;siamese").get(0)[2]);
+ Assert.assertEquals("1666747913000", EdgeHandlerTestUtil.edgeKeyResults.get("guppy%00;siamese").get(0)[3]);
+
+ }
+
+ @Test
+ public void testUnawarePreconSameGroupEarlyActivityDate() {
+ // FELINE == 'tabby'
+
+ fields.put("EVENT_DATE", new BaseNormalizedContent("EVENT_DATE", "2022-10-26T01:31:53Z"));
+ fields.put("UUID", new BaseNormalizedContent("UUID", "0016dd72-0000-827d-dd4d-001b2163ba09"));
+ fields.put("FELINE", new NormalizedFieldAndValue("FELINE", "tabby", "PET", "0"));
+ fields.put("FELINE", new NormalizedFieldAndValue("FELINE", "siamese", "PET", "1"));
+ fields.put("FISH", new NormalizedFieldAndValue("FISH", "salmon", "PET", "0"));
+ fields.put("FISH", new NormalizedFieldAndValue("FISH", "guppy", "PET", "1"));
+ fields.put("ACTIVITY", new NormalizedFieldAndValue("ACTIVITY", "fetch", "THING", "0"));
+
+ ProtobufEdgeDataTypeHandler edgeHandler = new ProtobufEdgeDataTypeHandler<>();
+ TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
+ edgeHandler.setup(context);
+
+ Set expectedKeys = new HashSet<>();
+ expectedKeys.add("guppy");
+ expectedKeys.add("guppy%00;siamese");
+ expectedKeys.add("salmon");
+ expectedKeys.add("salmon%00;tabby");
+ expectedKeys.add("siamese");
+ expectedKeys.add("siamese%00;guppy");
+ expectedKeys.add("tabby");
+ expectedKeys.add("tabby%00;salmon");
+
+ RawRecordContainer myEvent = getEvent(conf);
+ myEvent.setDate(1666737913000L);
+
+ // the count is doubled since activity < event date in this test. In this case, we add 2 edges each.
+ EdgeHandlerTestUtil.processEvent(fields, edgeHandler, myEvent, 16, true, false);
+ Assert.assertEquals(expectedKeys, EdgeHandlerTestUtil.edgeKeyResults.keySet());
+
+ Assert.assertEquals("MY_EDGE_TYPE/TO-FROM", EdgeHandlerTestUtil.edgeKeyResults.get("guppy%00;siamese").get(0)[0]);
+
+ // the dates
+ Assert.assertEquals("20221025/MY_CSV_DATA-MY_CSV_DATA///A", EdgeHandlerTestUtil.edgeKeyResults.get("guppy%00;siamese").get(0)[1]);
+ Assert.assertEquals("20221026/MY_CSV_DATA-MY_CSV_DATA///C", EdgeHandlerTestUtil.edgeKeyResults.get("guppy%00;siamese").get(1)[1]);
+
+ // values
+ Assert.assertEquals(2, EdgeHandlerTestUtil.edgeValueResults.get("guppy%00;siamese").size());
+ Assert.assertEquals(
+ "count: 1, bitmask: 4194304, sourceValue: guppy, sinkValue: siamese, hours: , duration: , loadDate: " + loadDateStr
+ + ", uuidString: , uuidObj: 0016dd72-0000-827d-dd4d-001b2163ba09, badActivityDate: false",
+ EdgeHandlerTestUtil.edgeValueResults.get("guppy%00;siamese").get(0));
+ Assert.assertEquals(
+ "count: 1, bitmask: 2, sourceValue: guppy, sinkValue: siamese, hours: , duration: , loadDate: " + loadDateStr
+ + ", uuidString: , uuidObj: 0016dd72-0000-827d-dd4d-001b2163ba09, badActivityDate: ",
+ EdgeHandlerTestUtil.edgeValueResults.get("guppy%00;siamese").get(1));
+ Assert.assertEquals(2, EdgeHandlerTestUtil.edgeValueResults.get("guppy").size());
+ Assert.assertEquals(
+ "count: , bitmask: , sourceValue: guppy, sinkValue: , hours: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], duration: , loadDate: "
+ + loadDateStr + ", uuidString: , uuidObj: 0016dd72-0000-827d-dd4d-001b2163ba09, badActivityDate: false",
+ EdgeHandlerTestUtil.edgeValueResults.get("guppy").get(0));
+ Assert.assertEquals(
+ "count: , bitmask: , sourceValue: guppy, sinkValue: , hours: [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], duration: , loadDate: "
+ + loadDateStr + ", uuidString: , uuidObj: 0016dd72-0000-827d-dd4d-001b2163ba09, badActivityDate: ",
+ EdgeHandlerTestUtil.edgeValueResults.get("guppy").get(1));
+
+ Assert.assertEquals("PRIVATE", EdgeHandlerTestUtil.edgeKeyResults.get("guppy%00;siamese").get(0)[2]);
+ Assert.assertEquals("1666737913000", EdgeHandlerTestUtil.edgeKeyResults.get("guppy%00;siamese").get(0)[3]);
+
+ }
+
+ @Test
+ public void testUnawarePreconSameGroupVeryOldData() {
+ // FELINE == 'tabby'
+
+ fields.put("EVENT_DATE", new BaseNormalizedContent("EVENT_DATE", "1966-09-08"));
+ fields.put("UUID", new BaseNormalizedContent("UUID", "0016dd72-0000-827d-dd4d-001b2163ba09"));
+ fields.put("FELINE", new NormalizedFieldAndValue("FELINE", "tabby", "PET", "0"));
+ fields.put("FELINE", new NormalizedFieldAndValue("FELINE", "siamese", "PET", "1"));
+ fields.put("FISH", new NormalizedFieldAndValue("FISH", "salmon", "PET", "0"));
+ fields.put("FISH", new NormalizedFieldAndValue("FISH", "guppy", "PET", "1"));
+ fields.put("ACTIVITY", new NormalizedFieldAndValue("ACTIVITY", "fetch", "THING", "0"));
+
+ ProtobufEdgeDataTypeHandler edgeHandler = new ProtobufEdgeDataTypeHandler<>();
+ TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
+ edgeHandler.setup(context);
+
+ Set expectedKeys = new HashSet<>();
+ expectedKeys.add("guppy");
+ expectedKeys.add("guppy%00;siamese");
+ expectedKeys.add("salmon");
+ expectedKeys.add("salmon%00;tabby");
+ expectedKeys.add("siamese");
+ expectedKeys.add("siamese%00;guppy");
+ expectedKeys.add("tabby");
+ expectedKeys.add("tabby%00;salmon");
+
+ RawRecordContainer myEvent = getEvent(conf);
+ myEvent.setDate(0L);
+
+ // the count is doubled since activity < event date in this test. In this case, we add 2 edges each.
+ EdgeHandlerTestUtil.processEvent(fields, edgeHandler, myEvent, 16, true, false);
+ Assert.assertEquals(expectedKeys, EdgeHandlerTestUtil.edgeKeyResults.keySet());
+
+ Assert.assertEquals("MY_EDGE_TYPE/TO-FROM", EdgeHandlerTestUtil.edgeKeyResults.get("guppy%00;siamese").get(0)[0]);
+
+ // the dates
+ Assert.assertEquals("19700101/MY_CSV_DATA-MY_CSV_DATA///A", EdgeHandlerTestUtil.edgeKeyResults.get("guppy%00;siamese").get(0)[1]);
+ Assert.assertEquals("19660908/MY_CSV_DATA-MY_CSV_DATA///C", EdgeHandlerTestUtil.edgeKeyResults.get("guppy%00;siamese").get(1)[1]);
+
+ // values
+ Assert.assertEquals(2, EdgeHandlerTestUtil.edgeValueResults.get("guppy%00;siamese").size());
+ Assert.assertEquals(
+ "count: 1, bitmask: , sourceValue: guppy, sinkValue: siamese, hours: , duration: , loadDate: " + loadDateStr
+ + ", uuidString: , uuidObj: 0016dd72-0000-827d-dd4d-001b2163ba09, badActivityDate: false",
+ EdgeHandlerTestUtil.edgeValueResults.get("guppy%00;siamese").get(0));
+ Assert.assertEquals(
+ "count: 1, bitmask: 1, sourceValue: guppy, sinkValue: siamese, hours: , duration: , loadDate: " + loadDateStr
+ + ", uuidString: , uuidObj: 0016dd72-0000-827d-dd4d-001b2163ba09, badActivityDate: ",
+ EdgeHandlerTestUtil.edgeValueResults.get("guppy%00;siamese").get(1));
+ Assert.assertEquals(2, EdgeHandlerTestUtil.edgeValueResults.get("guppy").size());
+ Assert.assertEquals(
+ "count: , bitmask: , sourceValue: guppy, sinkValue: , hours: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], duration: , loadDate: "
+ + loadDateStr + ", uuidString: , uuidObj: 0016dd72-0000-827d-dd4d-001b2163ba09, badActivityDate: ",
+ EdgeHandlerTestUtil.edgeValueResults.get("guppy").get(0));
+ Assert.assertEquals(
+ "count: , bitmask: , sourceValue: guppy, sinkValue: , hours: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], duration: , loadDate: "
+ + loadDateStr + ", uuidString: , uuidObj: 0016dd72-0000-827d-dd4d-001b2163ba09, badActivityDate: false",
+ EdgeHandlerTestUtil.edgeValueResults.get("guppy").get(1));
+
+ Assert.assertEquals("PRIVATE", EdgeHandlerTestUtil.edgeKeyResults.get("guppy%00;siamese").get(0)[2]);
+ Assert.assertEquals("0", EdgeHandlerTestUtil.edgeKeyResults.get("guppy%00;siamese").get(0)[3]);
}
@@ -147,7 +303,7 @@ public void testUnawarePreconDifferentGroup() {
RawRecordContainer myEvent = getEvent(conf);
EdgeHandlerTestUtil.processEvent(fields, edgeHandler, myEvent, 12, true, false);
- Assert.assertEquals(expectedKeys, EdgeHandlerTestUtil.edgeKeyResults);
+ Assert.assertEquals(expectedKeys, EdgeHandlerTestUtil.edgeKeyResults.keySet());
}
@@ -175,7 +331,7 @@ public void testAwarePreconSameGroup() {
RawRecordContainer myEvent = getEvent(conf);
EdgeHandlerTestUtil.processEvent(fields, edgeHandler, myEvent, 4, true, false);
- Assert.assertEquals(expectedKeys, EdgeHandlerTestUtil.edgeKeyResults);
+ Assert.assertEquals(expectedKeys, EdgeHandlerTestUtil.edgeKeyResults.keySet());
}
@@ -206,7 +362,7 @@ public void testAwarePreconDifferentGroup() {
RawRecordContainer myEvent = getEvent(conf);
EdgeHandlerTestUtil.processEvent(fields, edgeHandler, myEvent, 7, true, false);
- Assert.assertEquals(expectedKeys, EdgeHandlerTestUtil.edgeKeyResults);
+ Assert.assertEquals(expectedKeys, EdgeHandlerTestUtil.edgeKeyResults.keySet());
}
@@ -237,7 +393,7 @@ public void testAwareFieldComparison() {
RawRecordContainer myEvent = getEvent(conf);
EdgeHandlerTestUtil.processEvent(fields, edgeHandler, myEvent, 4, true, false);
- Assert.assertEquals(expectedKeys, EdgeHandlerTestUtil.edgeKeyResults);
+ Assert.assertEquals(expectedKeys, EdgeHandlerTestUtil.edgeKeyResults.keySet());
}
@@ -260,7 +416,7 @@ public void testAwareFieldComparisonNullCheck() {
RawRecordContainer myEvent = getEvent(conf);
EdgeHandlerTestUtil.processEvent(fields, edgeHandler, myEvent, 0, true, false);
- Assert.assertEquals(expectedKeys, EdgeHandlerTestUtil.edgeKeyResults);
+ Assert.assertEquals(expectedKeys, EdgeHandlerTestUtil.edgeKeyResults.keySet());
}
@@ -299,7 +455,7 @@ public void testAwareOrGroupsNotEqual() {
RawRecordContainer myEvent = getEvent(conf);
EdgeHandlerTestUtil.processEvent(fields, edgeHandler, myEvent, 8, true, false);
- Assert.assertEquals(expectedKeys, EdgeHandlerTestUtil.edgeKeyResults);
+ Assert.assertEquals(expectedKeys, EdgeHandlerTestUtil.edgeKeyResults.keySet());
}
@@ -334,7 +490,7 @@ public void testAwareGreaterThanSameGroup() {
RawRecordContainer myEvent = getEvent(conf);
EdgeHandlerTestUtil.processEvent(fields, edgeHandler, myEvent, 4, true, false);
- Assert.assertEquals(expectedKeys, EdgeHandlerTestUtil.edgeKeyResults);
+ Assert.assertEquals(expectedKeys, EdgeHandlerTestUtil.edgeKeyResults.keySet());
}
}
From 1ba664f1ac47a0ce3991899947213dc91812504a Mon Sep 17 00:00:00 2001
From: Drew Farris
Date: Wed, 1 Nov 2023 16:24:51 -0400
Subject: [PATCH 03/32] Implements improved weighted SSDeep match scoring
(#2129)
* SSDeepSimilarityQuery logic match scoring improvements
* implements improved weighted SSDeep match scoring
* implements optional min score threshold parameter
* removed MATCH_RANK because it does not reflect sorted scoring
* Updated unit test assertions for SSDeepSimilarityQueryTransformerTest
* Whitespace formatting for SSDeep-related code
* Updates per code review
* Further updates per code review
* Further updates per code review
* Further updates per code review
---------
Co-authored-by: hgklohr
---
.../SSDeepSimilarityQueryTransformer.java | 78 +++++++--
.../query/util/ssdeep/NGramScoreTuple.java | 47 ++++--
.../query/util/ssdeep/SSDeepHash.java | 13 +-
.../query/util/ssdeep/SSDeepHashScorer.java | 119 ++++++++++++++
.../java/datawave/query/SSDeepQueryTest.java | 151 ++++++++++++++----
.../SSDeepSimilarityQueryTransformerTest.java | 2 +
.../util/ssdeep/SSDeepHashScorerTest.java | 40 +++++
7 files changed, 382 insertions(+), 68 deletions(-)
create mode 100644 warehouse/query-core/src/main/java/datawave/query/util/ssdeep/SSDeepHashScorer.java
create mode 100644 warehouse/query-core/src/test/java/datawave/query/util/ssdeep/SSDeepHashScorerTest.java
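[Reviewer note, not part of the patch] The transformer now produces a weighted 0-100 similarity score (edit-distance based, computed over chunks normalized to collapse long character runs) alongside the existing ngram base score, and it honors an optional minScore query parameter that drops matches scoring at or below the threshold. A hedged usage sketch; the short ssdeep strings come from the new scorer test, and no particular score values are asserted here:

    // Score two parsed ssdeep hashes directly (0 = dissimilar, 100 = highly similar)
    SSDeepHashScorer scorer = new SSDeepHashScorer();       // default max repeated characters
    SSDeepHash query = SSDeepHash.parse("3:aabbcc:abc");
    SSDeepHash target = SSDeepHash.parse("3:aabbccdd:abcd");
    int weightedScore = scorer.apply(query, target);

    // Request score-threshold filtering through the new query parameter
    QueryImpl q = new QueryImpl();
    q.addParameter(SSDeepSimilarityQueryTransformer.MIN_SSDEEP_SCORE_PARAMETER, "65");

Out-of-range or non-numeric minScore values are logged and ignored, so omitting the parameter (or passing an empty value, as the transformer test does) leaves the previous behavior unchanged.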
diff --git a/warehouse/query-core/src/main/java/datawave/query/transformer/SSDeepSimilarityQueryTransformer.java b/warehouse/query-core/src/main/java/datawave/query/transformer/SSDeepSimilarityQueryTransformer.java
index 74bd1806d9..92deebc7ad 100644
--- a/warehouse/query-core/src/main/java/datawave/query/transformer/SSDeepSimilarityQueryTransformer.java
+++ b/warehouse/query-core/src/main/java/datawave/query/transformer/SSDeepSimilarityQueryTransformer.java
@@ -22,7 +22,9 @@
import datawave.query.util.ssdeep.NGramScoreTuple;
import datawave.query.util.ssdeep.NGramTuple;
import datawave.query.util.ssdeep.SSDeepHash;
+import datawave.query.util.ssdeep.SSDeepHashScorer;
import datawave.webservice.query.Query;
+import datawave.webservice.query.QueryImpl;
import datawave.webservice.query.exception.EmptyObjectException;
import datawave.webservice.query.logic.BaseQueryLogicTransformer;
import datawave.webservice.query.result.event.EventBase;
@@ -33,6 +35,8 @@
public class SSDeepSimilarityQueryTransformer extends BaseQueryLogicTransformer<Map.Entry<Key,Value>,Map.Entry<SSDeepHash,NGramScoreTuple>> {
+ public static final String MIN_SSDEEP_SCORE_PARAMETER = "minScore";
+
private static final Logger log = Logger.getLogger(SSDeepSimilarityQueryTransformer.class);
protected final Authorizations auths;
@@ -58,11 +62,17 @@ public class SSDeepSimilarityQueryTransformer extends BaseQueryLogicTransformer<
/** Tracks which ssdeep hashes each of the ngrams originated from */
final Multimap<NGramTuple,SSDeepHash> queryMap;
+ /** The maximum number of repeated characters allowed in a ssdeep hash - used to perform normalization for scoring */
+ final int maxRepeatedCharacters;
+
+ final int minScoreThreshold;
+
public SSDeepSimilarityQueryTransformer(Query query, SSDeepSimilarityQueryConfiguration config, MarkingFunctions markingFunctions,
ResponseObjectFactory responseObjectFactory) {
super(markingFunctions);
this.auths = new Authorizations(query.getQueryAuthorizations().split(","));
this.queryMap = config.getQueryMap();
+ this.maxRepeatedCharacters = config.getMaxRepeatedCharacters();
this.responseObjectFactory = responseObjectFactory;
this.bucketEncoder = new IntegerEncoding(config.getBucketEncodingBase(), config.getBucketEncodingLength());
@@ -70,6 +80,28 @@ public SSDeepSimilarityQueryTransformer(Query query, SSDeepSimilarityQueryConfig
this.chunkStart = bucketEncoder.getLength();
this.chunkEnd = chunkStart + chunkSizeEncoding.getLength();
+
+ this.minScoreThreshold = readOptionalMinScoreThreshold(query);
+ }
+
+ private int readOptionalMinScoreThreshold(Query query) {
+ QueryImpl.Parameter minScoreParameter = query.findParameter(MIN_SSDEEP_SCORE_PARAMETER);
+ if (minScoreParameter != null) {
+ String minScoreString = minScoreParameter.getParameterValue();
+ try {
+ int minScore = Integer.parseInt(minScoreString);
+ if (minScore < 0 || minScore > 100) {
+ log.warn("Ssdeep score threshold must be between 0-100, but was " + minScoreString + ", ignoring " + MIN_SSDEEP_SCORE_PARAMETER
+ + " parameter.");
+ } else {
+ return minScore;
+ }
+ } catch (NumberFormatException e) {
+ log.warn("Number format exception encountered when parsing score threshold of '" + minScoreString + "' ignoring " + MIN_SSDEEP_SCORE_PARAMETER
+ + " parameter.");
+ }
+ }
+ return 0;
}
@Override
@@ -112,12 +144,12 @@ public BaseQueryResponse generateResponseFromScores(Multimap transform(Map.Entry input) th
* @return a map of ssdeep hashes to score tuples.
*/
protected Multimap<SSDeepHash,NGramScoreTuple> scoreQuery(Multimap<NGramTuple,SSDeepHash> queryMap, Multimap<SSDeepHash,NGramTuple> chunkPostings) {
- // score based on chunk match count
+ // The base match score based on the number of matching ngrams shared between the query hash and the matched hash
+ // This map tracks that relationship: the query hash is the key; the matching hash and its score are the value.
final Map<SSDeepHash,Map<SSDeepHash,Integer>> scoredHashMatches = new TreeMap<>();
// align the chunk postings to their original query ssdeep hash and count the number of matches
- // for each chunk that corresponds to that original ssdeep hash
- chunkPostings.asMap().forEach((hash, cpc) -> {
- log.trace("Posting " + hash + " had " + cpc.size() + "chunk tuples");
- cpc.forEach(ct -> {
- Collection ssdhc = queryMap.get(ct);
- log.trace("Chunk tuple " + ct + " had " + ssdhc.size() + "related query hashes");
- ssdhc.forEach(ssdh -> {
- final Map chunkMatchCount = scoredHashMatches.computeIfAbsent(ssdh, s -> new TreeMap<>());
- final Integer score = chunkMatchCount.computeIfAbsent(hash, m -> 0);
- log.trace("Incrementing score for " + ssdh + "," + hash + " by " + cpc.size());
- chunkMatchCount.put(hash, score + 1);
+ // for each chunk that corresponds to that original ssdeep hash. The number of ngrams that the query and
+ // target have in common thus becomes the base score.
+ chunkPostings.asMap().forEach((matchingHash, matchingNgrams) -> {
+ log.trace("Posting " + matchingHash + " had " + matchingNgrams.size() + " matching ngrams");
+ matchingNgrams.forEach(matchingNgram -> { // for each matching hash ngram
+ Collection queryHashes = queryMap.get(matchingNgram); // find the queries that included that ngram
+ log.trace("Ngram " + matchingNgram + " had " + queryHashes.size() + " related query hashes");
+ queryHashes.forEach(queryHash -> { // increment the score for each query the ngram appeared in.
+ final Map chunkMatchCount = scoredHashMatches.computeIfAbsent(queryHash, s -> new TreeMap<>());
+ final Integer score = chunkMatchCount.computeIfAbsent(matchingHash, m -> 0);
+ log.trace("Incrementing score for " + queryHash + "," + matchingHash + " by 1");
+ chunkMatchCount.put(matchingHash, score + 1);
});
});
});
- // convert the counted chunks into tuples.
+ final SSDeepHashScorer scorer = new SSDeepHashScorer(maxRepeatedCharacters);
+
+ // convert the counted chunks into score tuples.
final Multimap<SSDeepHash,NGramScoreTuple> scoreTuples = TreeMultimap.create();
- scoredHashMatches.forEach((sdh, cmc) -> cmc.forEach((k, v) -> scoreTuples.put(sdh, new NGramScoreTuple(k, v))));
+ scoredHashMatches.forEach((queryHash, scoredMatches) -> {
+ scoredMatches.forEach((matchingHash, baseScore) -> {
+ int weightedScore = scorer.apply(queryHash, matchingHash);
+ // keep the scored tuple if either the minScoreThreshold is not set or the weightedScore exceeds the set threshold.
+ if (minScoreThreshold <= 0 || weightedScore > minScoreThreshold) {
+ scoreTuples.put(queryHash, new NGramScoreTuple(matchingHash, baseScore, weightedScore));
+ }
+ });
+ });
return scoreTuples;
}
diff --git a/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/NGramScoreTuple.java b/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/NGramScoreTuple.java
index ebb5905ede..d64fb0d3df 100644
--- a/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/NGramScoreTuple.java
+++ b/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/NGramScoreTuple.java
@@ -1,48 +1,65 @@
package datawave.query.util.ssdeep;
import java.io.Serializable;
-import java.util.Objects;
public class NGramScoreTuple implements Serializable, Comparable<NGramScoreTuple> {
final SSDeepHash ssDeepHash;
- final float score;
+ final float baseScore;
+ final int weightedScore;
- public NGramScoreTuple(SSDeepHash ssDeepHash, float score) {
+ public NGramScoreTuple(SSDeepHash ssDeepHash, float baseScore, int weightedScore) {
this.ssDeepHash = ssDeepHash;
- this.score = score;
+ this.baseScore = baseScore;
+ this.weightedScore = weightedScore;
}
public SSDeepHash getSsDeepHash() {
return ssDeepHash;
}
- public float getScore() {
- return score;
+ public float getBaseScore() {
+ return baseScore;
+ }
+
+ public int getWeightedScore() {
+ return weightedScore;
+ }
+
+ @Override
+ public String toString() {
+ return "ScoreTuple{" + "hash=" + ssDeepHash + ", baseScore=" + baseScore + ", weightedScore=" + weightedScore + '}';
}
@Override
public boolean equals(Object o) {
if (this == o)
return true;
- if (!(o instanceof NGramScoreTuple))
+ if (o == null || getClass() != o.getClass())
return false;
+
NGramScoreTuple that = (NGramScoreTuple) o;
- return ssDeepHash == that.ssDeepHash && Float.compare(that.score, score) == 0;
- }
- @Override
- public int hashCode() {
- return Objects.hash(ssDeepHash, score);
+ if (baseScore == that.baseScore)
+ return false;
+ if (weightedScore == that.weightedScore)
+ return false;
+ return ssDeepHash.equals(that.ssDeepHash);
}
@Override
- public String toString() {
- return "ScoreTuple{" + "hash=" + ssDeepHash + ", score=" + score + '}';
+ public int hashCode() {
+ int result = ssDeepHash.hashCode();
+ result = 31 * result + (baseScore != 0.0f ? Float.floatToIntBits(baseScore) : 0);
+ result = 31 * result + weightedScore;
+ return result;
}
@Override
public int compareTo(NGramScoreTuple o) {
- int cmp = Float.compare(o.score, score);
+ int cmp = Integer.compare(o.weightedScore, weightedScore);
+ if (cmp == 0) {
+ cmp = Float.compare(o.baseScore, baseScore);
+ }
if (cmp == 0) {
cmp = ssDeepHash.compareTo(o.ssDeepHash);
}
diff --git a/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/SSDeepHash.java b/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/SSDeepHash.java
index 0d398357e1..c1c5392e89 100644
--- a/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/SSDeepHash.java
+++ b/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/SSDeepHash.java
@@ -163,7 +163,8 @@ public static SSDeepHash normalize(final SSDeepHash input) {
public static SSDeepHash normalize(final SSDeepHash input, int maxRepeatedCharacters) {
final String n1 = normalizeSSDeepChunk(input.getChunk(), maxRepeatedCharacters);
final String n2 = normalizeSSDeepChunk(input.getDoubleChunk(), maxRepeatedCharacters);
- if (n1 == null && n2 == null) {
+ // we really do want '==' here, not equals: if neither chunk was changed, just return the input.
+ if (n1 == input.getChunk() && (n2 == input.getDoubleChunk())) {
return input;
}
return new SSDeepHash(input.getChunkSize(), n1 == null ? input.getChunk() : n1, n2 == null ? input.getDoubleChunk() : n2);
@@ -175,18 +176,18 @@ public SSDeepHash normalize(int maxRepeatedCharacters) {
/**
* Given a string that potentially contains long runs of repeating characters, replace such runs with at most maxRepeated characters. If the string is not
- * modified, return null.
+ * modified, return the input string.
*
* @param input
* the string to analyze and possibly modify.
* @param maxRepeatedCharacters
* the number of maxRepeatedCharacters to allow. Any String that has a run of more than this many of the same character will have that run
* collapsed to be this many characters in length. Zero indicates that no normalization should be performed.
- * @return the modified string or null if the string is not modified.
+ * @return the modified string or the original string if the string is not modified.
*/
public static String normalizeSSDeepChunk(final String input, final int maxRepeatedCharacters) {
if (maxRepeatedCharacters <= 0) {
- return null; // do nothing.
+ return input; // do nothing.
}
final char[] data = input.toCharArray();
final int length = data.length;
@@ -215,11 +216,11 @@ public static String normalizeSSDeepChunk(final String input, final int maxRepea
}
}
- // if we have modified the data, create and return a string otherwise, null
+ // if we have modified the data, create and return a string otherwise, return the input unchanged
if (destIndex < length) {
return new String(data, 0, destIndex);
} else {
- return null;
+ return input;
}
}
diff --git a/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/SSDeepHashScorer.java b/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/SSDeepHashScorer.java
new file mode 100644
index 0000000000..fce7cb5d66
--- /dev/null
+++ b/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/SSDeepHashScorer.java
@@ -0,0 +1,119 @@
+package datawave.query.util.ssdeep;
+
+import org.apache.commons.text.similarity.LevenshteinDistance;
+import org.apache.log4j.Logger;
+
+/** Implements functions to calculate a similarity score for a pair of SSDeepHashes */
+public class SSDeepHashScorer {
+ private static final Logger log = Logger.getLogger(SSDeepHash.class);
+
+ private final int maxRepeatedCharacters;
+
+ public SSDeepHashScorer() {
+ this(SSDeepHash.DEFAULT_MAX_REPEATED_CHARACTERS);
+ }
+
+ public SSDeepHashScorer(int maxRepeatedCharacters) {
+ this.maxRepeatedCharacters = maxRepeatedCharacters;
+ }
+
+ /**
+ * Compare two ssdeep hashes, returning a score between 0 and 100 that indicates similarity. A score of 0 means that the items are not similar at all, whereas
+ * a score of 100 indicates a high degree of similarity.
+ *
+ * @param signature1
+ * the first object to be compared.
+ * @param signature2
+ * the second object to be compared.
+ * @return an integer between 0 and 100
+ */
+ public int apply(SSDeepHash signature1, SSDeepHash signature2) {
+ if ((null == signature1) || (null == signature2)) {
+ return -1;
+ }
+ final long chunkSize1 = signature1.getChunkSize();
+ final long chunkSize2 = signature2.getChunkSize();
+
+ // We require the chunk size to either be equal, or for one to be twice the other. If the chunk sizes don't
+ // match then we are comparing apples to oranges. This isn't an 'error' per se. We could have two valid
+ // ssdeep hashes, but with chunk sizes so different they can't be compared.
+ if ((chunkSize1 != chunkSize2) && (chunkSize1 != (chunkSize2 * 2)) && (chunkSize2 != (chunkSize1 * 2))) {
+ if (log.isDebugEnabled()) {
+ log.debug("block sizes too different: " + chunkSize1 + " " + chunkSize2);
+ }
+ return 0;
+ }
+
+ // There is very little information content in sequences of the same character like 'LLLLL'. Eliminate any
+ // sequences longer than MAX_REPEATED_CHARACTERS (3).
+ final String s1chunk = SSDeepHash.normalizeSSDeepChunk(signature1.getChunk(), maxRepeatedCharacters);
+ final String s1doubleChunk = SSDeepHash.normalizeSSDeepChunk(signature1.getDoubleChunk(), maxRepeatedCharacters);
+ final String s2chunk = SSDeepHash.normalizeSSDeepChunk(signature2.getChunk(), maxRepeatedCharacters);
+ final String s2doubleChunk = SSDeepHash.normalizeSSDeepChunk(signature2.getDoubleChunk(), maxRepeatedCharacters);
+
+ // Each ssdeep has two chunks with different chunk sizes. Choose which ones to use from each hash for scoring.
+ final long score;
+ if (chunkSize1 == chunkSize2) {
+ // The ssdeep chunk sizes are equal.
+ final long score1 = scoreChunks(s1chunk, s2chunk, chunkSize1);
+ final long score2 = scoreChunks(s1doubleChunk, s2doubleChunk, chunkSize2);
+ score = Math.max(score1, score2);
+ } else if (chunkSize1 == (chunkSize2 * 2)) {
+ // The first ssdeep has twice the chunk size of the second.
+ score = scoreChunks(s1chunk, s2doubleChunk, chunkSize1);
+ } else {
+ // The second ssdeep has twice the chunk size of the first.
+ score = scoreChunks(s1doubleChunk, s2chunk, chunkSize2);
+ }
+
+ return (int) score;
+ }
+
+ /**
+ * This is the low level chunk scoring algorithm. It takes two chunks and scores them on a scale of 0-100 where 0 is a terrible match and 100 is a great
+ * match. The chunkSize is used to cope with very small messages.
+ */
+ private static int scoreChunks(final String s1, final String s2, final long chunkSize) {
+ final int len1 = s1.length();
+ final int len2 = s2.length();
+
+ if ((len1 > SSDeepHash.CHUNK_LENGTH) || (len2 > SSDeepHash.CHUNK_LENGTH)) {
+ // one of the chunk lengths exceeds the max chunk length, perhaps it is not a real ssdeep?
+ return 0;
+ }
+
+ // Compute the edit distance between the two chunk strings. The edit distance gives us a pretty good idea of
+ // how closely related the two chunks are.
+ int editDistance = LevenshteinDistance.getDefaultInstance().apply(s1, s2);
+ if (log.isDebugEnabled()) {
+ log.debug("edit_dist: " + editDistance);
+ }
+
+ // Scale the edit distance by the lengths of the two chunks. This changes the baseScore to be a measure of the
+ // proportion of the message that has changed rather than an absolute quantity. It also copes with the
+ // variability of the chunk string lengths.
+ int score = (editDistance * SSDeepHash.CHUNK_LENGTH) / (len1 + len2);
+
+ // At this stage the baseScore occurs roughly on a 0-64 scale,
+ // with 0 being a good match and 64 being a complete mismatch.
+
+ // Rescale to a 0-100 scale (friendlier to humans).
+ score = (100 * score) / SSDeepHash.CHUNK_LENGTH;
+
+ // It is possible to get a baseScore above 100 here, but it is a really terrible match.
+ if (score >= 100) {
+ return 0;
+ }
+
+ // Invert the score with 0 being a poor match and 100 being an excellent match.
+ score = 100 - score;
+
+ // When the chunk size is small we don't want to exaggerate the match.
+ final int threshold = (int) (chunkSize / SSDeepHash.MIN_CHUNK_SIZE * Math.min(len1, len2));
+ if (score > threshold) {
+ score = threshold;
+ }
+
+ return score;
+ }
+}
diff --git a/warehouse/query-core/src/test/java/datawave/query/SSDeepQueryTest.java b/warehouse/query-core/src/test/java/datawave/query/SSDeepQueryTest.java
index bb932abe2f..4b2d169de4 100644
--- a/warehouse/query-core/src/test/java/datawave/query/SSDeepQueryTest.java
+++ b/warehouse/query-core/src/test/java/datawave/query/SSDeepQueryTest.java
@@ -2,6 +2,7 @@
import static org.junit.Assert.fail;
+import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
@@ -57,6 +58,8 @@ public class SSDeepQueryTest {
public static String[] TEST_SSDEEPS = {"12288:002r/VG4GjeZHkwuPikQ7lKH5p5H9x1beZHkwulizQ1lK55pGxlXTd8zbW:002LVG4GjeZEXi37l6Br1beZEdic1lmu",
"6144:02C3nq73v1kHGhs6y7ppFj93NRW6/ftZTgC6e8o4toHZmk6ZxoXb0ns:02C4cGCLjj9Swfj9koHEk6/Fns",
"3072:02irbxzGAFYDMxud7fKg3dXVmbOn5u46Kjnz/G8VYrs123D6pIJLIOSP:02MKlWQ7Sg3d4bO968rm7JO",
+ "3072:03jscyaGAFYDMxud7fKg3dXVmbOn5u46Kjnz/G8VYrs123D6pIJLIOSP:03NLmXR7Sg3d4bO968rm7JO",
+ "3072:03jscyaZZZZZYYYYXXXWWdXVmbOn5u46KjnzWWWXXXXYYYYYYZZZZZZZ:03NLmXR7ZZZYYXW9WXYYZZZ",
"48:1aBhsiUw69/UXX0x0qzNkVkydf2klA8a7Z35:155w69MXAlNkmkWTF5", "196608:wEEE+EEEEE0LEEEEEEEEEEREEEEhEEETEEEEEWUEEEJEEEEcEEEEEEEE3EEEEEEN:",
"1536:0YgNvw/OmgPgiQeI+25Nh6+RS5Qa8LmbyfAiIRgizy1cBx76UKYbD+iD/RYgNvw6:", "12288:222222222222222222222222222222222:"};
@@ -135,17 +138,6 @@ public static void loadData() throws Exception {
logSSDeepTestData(tableName);
}
- private static void logSSDeepTestData(String tableName) throws TableNotFoundException {
- Scanner scanner = accumuloClient.createScanner(tableName, auths);
- Iterator<Map.Entry<Key,Value>> iterator = scanner.iterator();
- log.debug("*************** " + tableName + " ********************");
- while (iterator.hasNext()) {
- Map.Entry<Key,Value> entry = iterator.next();
- log.debug(entry);
- }
- scanner.close();
- }
-
@Before
public void setUpQuery() {
logic = new SSDeepSimilarityQueryLogic();
@@ -162,38 +154,65 @@ public void setUpQuery() {
}
@Test
- public void testSingleQuery() throws Exception {
+ /** Test that a single query ssdeep with no match score threshold returns the expected results */
+ public void testSingleQueryNoMinScore() throws Exception {
+ runSingleQuery(false);
+ }
+
+ @Test
+ /** Test that a single query ssdeep with a min score threshold returns the expected results */
+ public void testSingleQueryMinScore() throws Exception {
+ runSingleQuery(true);
+ }
+
+ private static void logSSDeepTestData(String tableName) throws TableNotFoundException {
+ Scanner scanner = accumuloClient.createScanner(tableName, auths);
+ Iterator<Map.Entry<Key,Value>> iterator = scanner.iterator();
+ log.debug("*************** " + tableName + " ********************");
+ while (iterator.hasNext()) {
+ Map.Entry<Key,Value> entry = iterator.next();
+ log.debug(entry);
+ }
+ scanner.close();
+ }
+
+ public void runSingleQuery(boolean applyMinScoreThreshold) throws Exception {
String query = "CHECKSUM_SSDEEP:" + TEST_SSDEEPS[2];
- EventQueryResponseBase response = runSSDeepQuery(query);
+
+ final int minScoreThreshold = applyMinScoreThreshold ? 65 : 0;
+ final int expectedEventCount = applyMinScoreThreshold ? 2 : 3;
+
+ EventQueryResponseBase response = runSSDeepQuery(query, minScoreThreshold);
List<EventBase> events = response.getEvents();
int eventCount = events.size();
+ Map<String,Map<String,String>> observedEvents = extractObservedEvents(events);
- Map<String,String> observedFields = new HashMap<>();
- if (eventCount > 0) {
- for (EventBase e : events) {
- List fields = e.getFields();
- for (FieldBase f : fields) {
- observedFields.put(f.getName(), f.getValueString());
- }
- }
- }
+ Assert.assertEquals(expectedEventCount, eventCount);
- Assert.assertFalse("Observed fields was unexpectedly empty", observedFields.isEmpty());
- Assert.assertEquals("65.0", observedFields.remove("MATCH_SCORE"));
- Assert.assertEquals("1", observedFields.remove("MATCH_RANK"));
- Assert.assertEquals("3072:02irbxzGAFYDMxud7fKg3dXVmbOn5u46Kjnz/G8VYrs123D6pIJLIOSP:02MKlWQ7Sg3d4bO968rm7JO", observedFields.remove("QUERY_SSDEEP"));
- Assert.assertEquals("3072:02irbxzGAFYDMxud7fKg3dXVmbOn5u46Kjnz/G8VYrs123D6pIJLIOSP:02MKlWQ7Sg3d4bO968rm7JO", observedFields.remove("MATCHING_SSDEEP"));
- Assert.assertTrue("Observed unexpected field(s): " + observedFields, observedFields.isEmpty());
- Assert.assertEquals(1, eventCount);
+ // find the fields for the self match example.
+ assertMatch(TEST_SSDEEPS[2], TEST_SSDEEPS[2], "65.0", "1", "100", observedEvents);
+
+ // find and validate the fields for the partial match example.
+ assertMatch(TEST_SSDEEPS[2], TEST_SSDEEPS[3], "51.0", "2", "96", observedEvents);
+
+ if (applyMinScoreThreshold)
+ assertNoMatch(TEST_SSDEEPS[2], TEST_SSDEEPS[3], observedEvents);
+ else
+ assertMatch(TEST_SSDEEPS[2], TEST_SSDEEPS[4], "9.0", "3", "63", observedEvents);
}
- public EventQueryResponseBase runSSDeepQuery(String query) throws Exception {
+ public EventQueryResponseBase runSSDeepQuery(String query, int minScoreThreshold) throws Exception {
+
QueryImpl q = new QueryImpl();
q.setQuery(query);
q.setId(UUID.randomUUID());
q.setPagesize(Integer.MAX_VALUE);
q.setQueryAuthorizations(auths.toString());
+ if (minScoreThreshold > 0) {
+ q.addParameter(SSDeepSimilarityQueryTransformer.MIN_SSDEEP_SCORE_PARAMETER, String.valueOf(minScoreThreshold));
+ }
+
RunningQuery runner = new RunningQuery(accumuloClient, AccumuloConnectionFactory.Priority.NORMAL, this.logic, q, "", principal,
new QueryMetricFactoryImpl());
TransformIterator transformIterator = runner.getTransformIterator();
@@ -202,4 +221,76 @@ public EventQueryResponseBase runSSDeepQuery(String query) throws Exception {
return response;
}
+
+ /** Extract the events from a set of results into an easy to manage data structure for validation */
+ public Map<String,Map<String,String>> extractObservedEvents(List<EventBase> events) {
+ int eventCount = events.size();
+ Map<String,Map<String,String>> observedEvents = new HashMap<>();
+ if (eventCount > 0) {
+ for (EventBase e : events) {
+ Map<String,String> observedFields = new HashMap<>();
+ String querySsdeep = "UNKNOWN_QUERY";
+ String matchingSsdeep = "UNKNOWN_MATCH";
+
+ List fields = e.getFields();
+ for (FieldBase f : fields) {
+ if (f.getName().equals("QUERY_SSDEEP")) {
+ querySsdeep = f.getValueString();
+ }
+ if (f.getName().equals("MATCHING_SSDEEP")) {
+ matchingSsdeep = f.getValueString();
+ }
+ observedFields.put(f.getName(), f.getValueString());
+ }
+
+ String eventKey = querySsdeep + "#" + matchingSsdeep;
+ observedEvents.put(eventKey, observedFields);
+ }
+ }
+ return observedEvents;
+ }
+
+ /**
+ * assert that a match exists between the specified query and matching ssdeep and that the match has the expected properties
+ *
+ * @param querySsdeep
+ * the query ssdeep we expect to find in the match results
+ * @param matchingSsdeep
+ * the matching ssdeep we expect to find in the match results.
+ * @param matchScore
+ * the base match score
+ * @param matchRank
+ * the match rank
+ * @param weightedScore
+ * the weighted match score.
+ * @param observedEvents
+ * the map of observed events, created by extractObservedEvents on the event list obtained from query execution.
+ */
+ public static void assertMatch(String querySsdeep, String matchingSsdeep, String matchScore, String matchRank, String weightedScore,
+ Map<String,Map<String,String>> observedEvents) {
+ final Map<String,String> observedFields = observedEvents.get(querySsdeep + "#" + matchingSsdeep);
+ Assert.assertNotNull("Observed fields was null", observedFields);
+ Assert.assertFalse("Observed fields was unexpectedly empty", observedFields.isEmpty());
+ Assert.assertEquals(matchScore, observedFields.remove("MATCH_SCORE"));
+ Assert.assertEquals(weightedScore, observedFields.remove("WEIGHTED_SCORE"));
+ Assert.assertEquals(querySsdeep, observedFields.remove("QUERY_SSDEEP"));
+ Assert.assertEquals(matchingSsdeep, observedFields.remove("MATCHING_SSDEEP"));
+ Assert.assertTrue("Observed unexpected field(s) in full match: " + observedFields, observedFields.isEmpty());
+ }
+
+ /**
+ * Assert that the results do not contain a match between the specified query and matching ssdeep
+ *
+ * @param querySsdeep
+ * the query ssdeep we do not expect to find in the match results
+ * @param matchingSsdeep
+ * the matching ssdeep we do not expect to find in the match results
+ * @param observedEvents
+ * the map of the observed events, created by extractObservedEvents on the event list obtained from query execution.
+ */
+ public static void assertNoMatch(String querySsdeep, String matchingSsdeep, Map<String,Map<String,String>> observedEvents) {
+ final Map<String,String> observedFields = observedEvents.get(querySsdeep + "#" + matchingSsdeep);
+ Assert.assertTrue("Observed fields was not empty", observedFields.isEmpty());
+
+ }
}
diff --git a/warehouse/query-core/src/test/java/datawave/query/transformer/SSDeepSimilarityQueryTransformerTest.java b/warehouse/query-core/src/test/java/datawave/query/transformer/SSDeepSimilarityQueryTransformerTest.java
index 4cf1042f58..4d2399b7e4 100644
--- a/warehouse/query-core/src/test/java/datawave/query/transformer/SSDeepSimilarityQueryTransformerTest.java
+++ b/warehouse/query-core/src/test/java/datawave/query/transformer/SSDeepSimilarityQueryTransformerTest.java
@@ -6,6 +6,7 @@
import java.util.List;
import java.util.Map;
+import datawave.webservice.query.QueryImpl;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Value;
import org.easymock.EasyMock;
@@ -52,6 +53,7 @@ public class SSDeepSimilarityQueryTransformerTest {
public void basicExpects(Key k) {
EasyMock.expect(mockQuery.getQueryAuthorizations()).andReturn("A,B,C");
+ EasyMock.expect(mockQuery.findParameter("minScore")).andReturn(new QueryImpl.Parameter("minScore",""));
EasyMock.expect(mockResponseFactory.getEventQueryResponse()).andReturn(new DefaultEventQueryResponse());
EasyMock.expect(mockResponseFactory.getEvent()).andReturn(new DefaultEvent()).times(1);
EasyMock.expect(mockResponseFactory.getField()).andReturn(new DefaultField()).times(4);
diff --git a/warehouse/query-core/src/test/java/datawave/query/util/ssdeep/SSDeepHashScorerTest.java b/warehouse/query-core/src/test/java/datawave/query/util/ssdeep/SSDeepHashScorerTest.java
new file mode 100644
index 0000000000..ef9e74f4df
--- /dev/null
+++ b/warehouse/query-core/src/test/java/datawave/query/util/ssdeep/SSDeepHashScorerTest.java
@@ -0,0 +1,40 @@
+package datawave.query.util.ssdeep;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+public class SSDeepHashScorerTest {
+
+ public static final String[][] testData = {
+ {"3072:02irbxzGAFYDMxud7fKg3dXVmbOn5u46Kjnz/G8VYrs123D6pIJLIOSP:02MKlWQ7Sg3d4bO968rm7JO",
+ "3072:02irbxzGAFYDMxud7fKg3dXVmbOn5u46Kjnz/G8VYrs123D6pIJLIOSP:02MKlWQ7Sg3d4bO968rm7JO"},
+ // repeated character case
+ {"3072:02irbxzGAFYDMxud7fKg3dXVmbOn5u46Kjnz/G8VYEEEEEEEEEEEEEEE:02MKlWQ7Sg3d4bEEEEEEEE",
+ "3072:02irbxzGAFYDMxud7fKg3dXVmbOn5u46Kjnz/G8VYrs123D6EEEEEEEE:02MKlWQ7Sg3d4bEEEE"},
+ // chunk difference is less than 2 scales, so we can compare these.
+ {"3072:02irbxzGAFYDMxud7fKg3dXVmbOn5u46Kjnz/G8VYrs123D6pIJLIOSP:02MKlWQ7Sg3d4bO968rm7JO",
+ "6144:02MKlWQ7Sg3d4bO968rm7JORW6/ftZTgC6e8o4toHZmk6ZxoXb0ns:02C4cGCLjj9Swfj9koHEk6/Fns"},
+ // inverse of the last example tests symmetry
+ {"6144:02MKlWQ7Sg3d4bO968rm7JORW6/ftZTgC6e8o4toHZmk6ZxoXb0ns:02C4cGCLjj9Swfj9koHEk6/Fns",
+ "3072:02irbxzGAFYDMxud7fKg3dXVmbOn5u46Kjnz/G8VYrs123D6pIJLIOSP:02MKlWQ7Sg3d4bO968rm7JO"},
+ // chunk mismatch case
+ {"3072:02irbxzGAFYDMxud7fKg3dXVmbOn5u46Kjnz/G8VYrs123D6pIJLIOSP:02MKlWQ7Sg3d4bO968rm7JO",
+ "48:1aBhsiUw69/UXX0x0qzNkVkydf2klA8a7Z35:155w69MXAlNkmkWTF5"},
+ // short hash case
+ {"3:aabbcc:abc", "3:aabbccdd:abcd"}, {"6:aabbcc:abc", "6:aabbccdd:abcd"}
+
+ };
+
+ public static final int[] expectedScores = {100, 100, 61, 61, 0, 6, 12};
+
+ @Test
+ public void testCompare() {
+ SSDeepHashScorer scorer = new SSDeepHashScorer();
+ for (int i = 0; i < testData.length; i++) {
+ SSDeepHash queryHash = SSDeepHash.parse(testData[i][0]);
+ SSDeepHash targetHash = SSDeepHash.parse(testData[i][1]);
+ int score = scorer.apply(queryHash, targetHash);
+ Assert.assertEquals("Expected score of " + expectedScores[i] + " for query: " + queryHash + ", target: " + targetHash, expectedScores[i], score);
+ }
+ }
+}
From 363167d2cf4a0cd8a2b73b729111ca61ffa0868c Mon Sep 17 00:00:00 2001
From: Ivan Bella
Date: Fri, 27 Oct 2023 07:12:41 -0400
Subject: [PATCH 04/32] Fixing the composite query logic to handle long running
queries (#2147)
* Fixing the composite query logic to handle long running queries
* Do not pass EmptyObjectExceptions all the way through the composite
query logic
This was resulting in was too many empty pages
* Updated to rely on RunningQuery to handle intermediate results for long
running queries in the CompositeQueryLogic
Conflicts:
web-services/query/src/main/java/datawave/webservice/query/logic/composite/CompositeQueryLogic.java
web-services/query/src/main/java/datawave/webservice/query/logic/composite/CompositeQueryLogicResultsIterator.java
Conflicts:
web-services/query/src/main/java/datawave/webservice/query/logic/composite/CompositeQueryLogicResultsIterator.java
---
.../query/transformer/GroupingTransform.java | 8 ++++
.../query/util/ssdeep/ChunkSizeEncoding.java | 30 ++++++-------
.../query/util/ssdeep/IntegerEncoding.java | 30 ++++++-------
.../query/util/ssdeep/SSDeepEncoding.java | 2 +-
.../logic/composite/CompositeQueryLogic.java | 42 ++++++++++++++++---
.../CompositeQueryLogicResultsIterator.java | 7 +---
.../composite/CompositeQueryLogicTest.java | 32 ++++++++++++--
7 files changed, 105 insertions(+), 46 deletions(-)
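[Reviewer note, not part of the patch] The long-running-query handling relies on transforms emitting a marker Document when the current page has been executing longer than the configured timeout; RunningQuery can then return an intermediate page instead of letting the query time out. The pattern, condensed from the GroupingTransform hunk below (timeout and start-time fields as named in the patch):

    long elapsed = System.currentTimeMillis() - this.queryExecutionForPageStartTime;
    if (elapsed > this.queryExecutionForPageTimeout) {
        // signal "still working" to the caller without producing a real result
        Document intermediateResult = new Document();
        intermediateResult.setIntermediateResult(true);
        return Maps.immutableEntry(new Key(), intermediateResult);
    }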
diff --git a/warehouse/query-core/src/main/java/datawave/query/transformer/GroupingTransform.java b/warehouse/query-core/src/main/java/datawave/query/transformer/GroupingTransform.java
index e94fe93e6c..1fe8ea283b 100644
--- a/warehouse/query-core/src/main/java/datawave/query/transformer/GroupingTransform.java
+++ b/warehouse/query-core/src/main/java/datawave/query/transformer/GroupingTransform.java
@@ -134,6 +134,8 @@ public Entry apply(@Nullable Entry keyDocumentEntry)
long elapsedExecutionTimeForCurrentPage = System.currentTimeMillis() - this.queryExecutionForPageStartTime;
if (elapsedExecutionTimeForCurrentPage > this.queryExecutionForPageTimeout) {
+ log.debug("Generating intermediate result because over {}ms has been reached since {}", this.queryExecutionForPageTimeout,
+ this.queryExecutionForPageStartTime);
Document intermediateResult = new Document();
intermediateResult.setIntermediateResult(true);
return Maps.immutableEntry(new Key(), intermediateResult);
@@ -142,6 +144,12 @@ public Entry apply(@Nullable Entry keyDocumentEntry)
return null;
}
+ @Override
+ public void setQueryExecutionForPageStartTime(long queryExecutionForPageStartTime) {
+ log.debug("setting query execution page start time to {}", queryExecutionForPageStartTime);
+ super.setQueryExecutionForPageStartTime(queryExecutionForPageStartTime);
+ }
+
@Override
public Entry flush() {
Document document = null;
diff --git a/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/ChunkSizeEncoding.java b/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/ChunkSizeEncoding.java
index 2545d2f34a..9cb397be20 100644
--- a/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/ChunkSizeEncoding.java
+++ b/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/ChunkSizeEncoding.java
@@ -26,18 +26,18 @@
*/
//@formatter:on
public class ChunkSizeEncoding implements Serializable {
-
+
private static final int MIN_CHUNK_SIZE = 3;
private static final int DEFAULT_ENCODING_ALPHABET_LENGTH = HashReverse.LEXICAL_B64_TABLE.length;
-
+
private static final int DEFAULT_ENCODING_LENGTH = 1;
-
+
static final double L2 = Math.log(2);
-
+
private final IntegerEncoding chunkIndexEncoding;
-
+
final int minChunkSize;
-
+
/**
* Create a ChunkSizeEncoding with the default parameters of a 64 character encoding alphabet and a length of 1. This allows us to encode 64 distinct chunk
* index values. Chunk index 0 represents the MIN_CHUNK_SIZE. See class javadocs for more info.
@@ -45,43 +45,43 @@ public class ChunkSizeEncoding implements Serializable {
public ChunkSizeEncoding() {
this(MIN_CHUNK_SIZE, DEFAULT_ENCODING_ALPHABET_LENGTH, DEFAULT_ENCODING_LENGTH);
}
-
+
public ChunkSizeEncoding(int minChunkSize, int encodingAlphabetLength, int encodingLength) {
this.minChunkSize = minChunkSize;
this.chunkIndexEncoding = new IntegerEncoding(encodingAlphabetLength, encodingLength);
}
-
+
public long getLimit() {
return findChunkSizeIndex(chunkIndexEncoding.getLimit());
}
-
+
public int getLength() {
return chunkIndexEncoding.getLength();
}
-
+
public long findNthChunkSize(int index) {
return minChunkSize * ((long) Math.pow(2, index));
}
-
+
public int findChunkSizeIndex(long chunkSize) {
return (int) (Math.log(chunkSize / (float) minChunkSize) / L2);
}
-
+
public String encode(int chunkSize) {
int index = findChunkSizeIndex(chunkSize);
return chunkIndexEncoding.encode(index);
}
-
+
public byte[] encodeToBytes(int chunkSize, byte[] buffer, int offset) {
int index = findChunkSizeIndex(chunkSize);
return chunkIndexEncoding.encodeToBytes(index, buffer, offset);
}
-
+
public int decode(String encoded) {
int index = chunkIndexEncoding.decode(encoded);
return (int) findNthChunkSize(index);
}
-
+
public int decode(byte[] encoded, int offset) {
int index = chunkIndexEncoding.decode(encoded, offset);
return (int) findNthChunkSize(index);
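Note: the chunk-size handling above reduces to two inverse mappings, index = floor(log2(chunkSize / minChunkSize)) and chunkSize = minChunkSize * 2^index. A minimal standalone sketch of that math (not part of the patch, and assuming the default minimum chunk size of 3):

    // Illustrative only: the chunk-size index math used by ChunkSizeEncoding above.
    public class ChunkSizeMathSketch {
        static final int MIN_CHUNK_SIZE = 3;      // default from ChunkSizeEncoding
        static final double L2 = Math.log(2);

        // index -> chunk size: minChunkSize * 2^index
        static long findNthChunkSize(int index) {
            return MIN_CHUNK_SIZE * ((long) Math.pow(2, index));
        }

        // chunk size -> index: log2(chunkSize / minChunkSize)
        static int findChunkSizeIndex(long chunkSize) {
            return (int) (Math.log(chunkSize / (float) MIN_CHUNK_SIZE) / L2);
        }

        public static void main(String[] args) {
            // chunk size 3072 (as seen in the hashes in SSDeepHashScorerTest) maps to index 10,
            // because 3 * 2^10 = 3072; the round trip reproduces the original chunk size.
            System.out.println(findChunkSizeIndex(3072)); // 10
            System.out.println(findNthChunkSize(10));     // 3072
        }
    }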
diff --git a/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/IntegerEncoding.java b/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/IntegerEncoding.java
index 6f11163319..b7b76238ec 100644
--- a/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/IntegerEncoding.java
+++ b/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/IntegerEncoding.java
@@ -7,20 +7,20 @@
* Class for encoding integers into a lexically sorted output of constant length. Employs the sorted Base64 alphabet captured in the HashReverse class.
*/
public class IntegerEncoding implements Serializable {
-
+
// The number of distinct characters used for encoding
final int base;
// the target length of the encoding
final int length;
// the max integer value we can encode, derived from the base and length parameters.
final int limit;
-
+
/**
* We are using the LEXICAL_B64_TABLE to encode integers to characters, our max base (the unique characters we use for encoding) is based on the size of
* this alphabet.
*/
private static final int MAX_BASE = HashReverse.LEXICAL_B64_TABLE.length;
-
+
/**
* Create an unsigned integer encoder that uses the specified base (up to 64) and length (which can't generate numbers larger than Integer.MAX_VALUE). This
* uses the lexically sorted Base 64 alphabet for encoding.
@@ -45,21 +45,21 @@ public IntegerEncoding(int base, int length) {
}
this.limit = (int) calculatedLimit; // truncation is fine here.
}
-
+
/** Return the maximum value this encoder can encode */
public int getLimit() {
return limit;
}
-
+
public int getLength() {
return length;
}
-
+
/** Encode the provided value, return a string result */
public String encode(int value) {
return new String(encodeToBytes(value, new byte[length], 0));
}
-
+
/**
* encode the provided value, writing the result to the provided buffer starting offset
*
@@ -75,11 +75,11 @@ public byte[] encodeToBytes(int value, byte[] buffer, int offset) {
if (value < 0 || value >= limit) {
throw new IllegalArgumentException("Can't encode " + value + " is it out of range, max: " + limit + " was: " + value);
}
-
+
if (buffer.length < offset + length) {
throw new IndexOutOfBoundsException("Can't encode a value of length " + length + " at offset " + offset + " buffer too small: " + buffer.length);
}
-
+
int remaining = value;
for (int place = length; place > 0; place--) {
final int scale = ((int) Math.pow(base, place - 1));
@@ -92,7 +92,7 @@ public byte[] encodeToBytes(int value, byte[] buffer, int offset) {
}
return buffer;
}
-
+
// TODO: make this just like encodeToBytes?
public static byte[] encodeBaseTenDigitBytes(int value) {
int remaining = value;
@@ -108,7 +108,7 @@ public static byte[] encodeBaseTenDigitBytes(int value) {
}
return results;
}
-
+
/**
* Decode the first _length_ characters in the encoded value into an integer, where length is specified in the constructor.
*
@@ -122,7 +122,7 @@ public int decode(String encodedValue) {
}
return decode(encodedValue.getBytes(StandardCharsets.UTF_8), 0);
}
-
+
/**
* decode the value contained within the provided byte[] starting at the specified offset
*
@@ -140,7 +140,7 @@ public int decode(byte[] encoded, int offset) {
if (encoded.length < offset + length) {
throw new IndexOutOfBoundsException("Can't decode a value of length " + length + " from offset " + offset + " buffer too small: " + encoded.length);
}
-
+
int result = 0;
for (int place = length; place > 0; place--) {
int pos = offset + (length - place);
@@ -150,11 +150,11 @@ public int decode(byte[] encoded, int offset) {
}
result += (int) Math.pow(base, place - 1) * value;
}
-
+
if (result > limit) {
throw new IllegalArgumentException("Can't decode input is it out of range, max: " + limit + " was: " + result);
}
-
+
return result;
}
}
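IntegerEncoding's scheme is plain positional encoding against a lexically sorted alphabet, so that numeric order and lexicographic order agree for fixed-length output. A rough standalone sketch (not part of the patch; the alphabet below is only a stand-in for HashReverse.LEXICAL_B64_TABLE, which is not shown here):

    // Illustrative only: fixed-length positional encoding over an ASCII-sorted 64-character alphabet.
    public class IntegerEncodingSketch {
        static final char[] ALPHABET = "+/0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz".toCharArray();

        // Encode 'value' into 'length' characters, most significant position first.
        // Assumes 0 <= value < base^length and base <= 64.
        static String encode(int value, int base, int length) {
            char[] out = new char[length];
            int remaining = value;
            for (int place = length; place > 0; place--) {
                int scale = (int) Math.pow(base, place - 1);
                int digit = remaining / scale;
                out[length - place] = ALPHABET[digit];
                remaining -= digit * scale;
            }
            return new String(out);
        }

        public static void main(String[] args) {
            // With base 64 and length 2 there are 64^2 = 4096 distinct values, and because the
            // alphabet is ASCII-sorted, smaller integers always sort before larger ones.
            System.out.println(encode(10, 64, 2));
            System.out.println(encode(100, 64, 2));
            System.out.println(encode(10, 64, 2).compareTo(encode(100, 64, 2)) < 0); // true
        }
    }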
diff --git a/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/SSDeepEncoding.java b/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/SSDeepEncoding.java
index a067bd200a..955186cd2b 100644
--- a/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/SSDeepEncoding.java
+++ b/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/SSDeepEncoding.java
@@ -11,7 +11,7 @@ public class SSDeepEncoding implements Serializable {
public byte[] encode(String ngram) {
return encodeToBytes(ngram, new byte[ngram.length()], 0);
}
-
+
public byte[] encodeToBytes(String ngram, byte[] buffer, int offset) {
for (int i = 0; i < ngram.length(); i++) {
buffer[i + offset] = (byte) ngram.charAt(i);
diff --git a/web-services/query/src/main/java/datawave/webservice/query/logic/composite/CompositeQueryLogic.java b/web-services/query/src/main/java/datawave/webservice/query/logic/composite/CompositeQueryLogic.java
index 783524f1d2..f689fa7a80 100644
--- a/web-services/query/src/main/java/datawave/webservice/query/logic/composite/CompositeQueryLogic.java
+++ b/web-services/query/src/main/java/datawave/webservice/query/logic/composite/CompositeQueryLogic.java
@@ -36,6 +36,7 @@
import datawave.webservice.query.logic.BaseQueryLogic;
import datawave.webservice.query.logic.QueryLogic;
import datawave.webservice.query.logic.QueryLogicTransformer;
+import datawave.webservice.query.result.event.EventBase;
import datawave.webservice.result.BaseResponse;
/**
@@ -121,20 +122,38 @@ public void run() {
started = true;
}
+ // ensure we start with a reasonable page time
+ resetPageProcessingStartTime();
+
// the results queue is also an exception handler
setUncaughtExceptionHandler(results);
boolean success = false;
try {
Object last = new Object();
- if (this.getMaxResults() < 0)
+ if (this.getMaxResults() <= 0)
this.setMaxResults(Long.MAX_VALUE);
while ((null != last) && !interrupted && transformIterator.hasNext() && (resultCount < this.getMaxResults())) {
try {
last = transformIterator.next();
if (null != last) {
- log.debug(Thread.currentThread().getName() + ": Added object to results");
- results.add(last);
+ log.debug(Thread.currentThread().getName() + ": Got result");
+
+ // special logic to deal with intermediate results
+ if (last instanceof EventBase && ((EventBase) last).isIntermediateResult()) {
+ resetPageProcessingStartTime();
+ // reset the page processing time to avoid getting spammed with these
+ // let the RunningQuery handle timeouts for long-running queries
+ if (isLongRunningQuery()) {
+ last = null;
+ }
+ }
+
+ if (last != null) {
+ results.add(last);
+ resultCount++;
+ log.debug(Thread.currentThread().getName() + ": Added result to queue");
+ }
}
} catch (InterruptedException e) {
// if this was on purpose, then just log and the loop will naturally exit
@@ -146,10 +165,8 @@ public void run() {
throw new RuntimeException(e);
}
} catch (EmptyObjectException eoe) {
- // Adding an empty object exception to the results queue needs to be passed all the way out.
- results.add(eoe);
+ // ignore these
}
- resultCount++;
}
success = true;
} catch (Exception e) {
@@ -162,6 +179,9 @@ public void run() {
}
}
+ public void resetPageProcessingStartTime() {
+ logic.setPageProcessingStartTime(System.currentTimeMillis());
+ }
}
protected static final Logger log = Logger.getLogger(CompositeQueryLogic.class);
@@ -615,6 +635,16 @@ public void setPageProcessingStartTime(long pageProcessingStartTime) {
}
}
+ @Override
+ public boolean isLongRunningQuery() {
+ for (QueryLogic<?> l : getQueryLogics().values()) {
+ if (l.isLongRunningQuery()) {
+ return true;
+ }
+ }
+ return false;
+ }
+
public boolean isAllMustInitialize() {
return getConfig().isAllMustInitialize();
}
diff --git a/web-services/query/src/main/java/datawave/webservice/query/logic/composite/CompositeQueryLogicResultsIterator.java b/web-services/query/src/main/java/datawave/webservice/query/logic/composite/CompositeQueryLogicResultsIterator.java
index b35c6ccb96..bb412ce53b 100644
--- a/web-services/query/src/main/java/datawave/webservice/query/logic/composite/CompositeQueryLogicResultsIterator.java
+++ b/web-services/query/src/main/java/datawave/webservice/query/logic/composite/CompositeQueryLogicResultsIterator.java
@@ -67,9 +67,7 @@ public boolean hasNext() {
}
}
if (nextEntry != null) {
- if (!(nextEntry instanceof EmptyObjectException)) {
- seenEntries = true;
- }
+ seenEntries = true;
return true;
}
return false;
@@ -88,9 +86,6 @@ public Object next() {
nextEntry = null;
}
}
- if (current instanceof EmptyObjectException) {
- throw new EmptyObjectException();
- }
return current;
}
diff --git a/web-services/query/src/test/java/datawave/webservice/query/logic/composite/CompositeQueryLogicTest.java b/web-services/query/src/test/java/datawave/webservice/query/logic/composite/CompositeQueryLogicTest.java
index fdce9e9203..cdc7b152b4 100644
--- a/web-services/query/src/test/java/datawave/webservice/query/logic/composite/CompositeQueryLogicTest.java
+++ b/web-services/query/src/test/java/datawave/webservice/query/logic/composite/CompositeQueryLogicTest.java
@@ -397,6 +397,11 @@ public QueryLogicTransformer getTransformer(Query settings) {
public GenericQueryConfiguration initialize(AccumuloClient client, Query settings, Set<Authorizations> runtimeQueryAuthorizations) throws Exception {
return new TestQueryConfiguration();
}
+
+ @Override
+ public boolean isLongRunningQuery() {
+ return true;
+ }
}
public static class DifferentTestQueryLogic extends BaseQueryLogic> {
@@ -1145,7 +1150,7 @@ public void testQueryLogicWithMaxResultsOverride() throws Exception {
CompositeQueryLogic c = new CompositeQueryLogic();
// max.results.override is set to -1 when it is not passed in as it is an optional parameter
- logic1.setMaxResults(0);
+ logic1.setMaxResults(1);
logic2.setMaxResults(4);
/**
* RunningQuery.setupConnection()
@@ -1167,14 +1172,14 @@ public void testQueryLogicWithMaxResultsOverride() throws Exception {
Assert.assertTrue(o instanceof TestQueryResponse);
results.add(o);
}
- Assert.assertEquals(4, results.size());
+ Assert.assertEquals(5, results.size());
ResultsPage page = new ResultsPage(results, Status.COMPLETE);
/**
* QueryExecutorBean.next() - transform list of objects into JAXB response
*/
TestQueryResponseList response = (TestQueryResponseList) c.getEnrichedTransformer((Query) settings).createResponse(page);
- Assert.assertEquals(4, response.getResponses().size());
+ Assert.assertEquals(5, response.getResponses().size());
for (TestQueryResponse r : response.getResponses()) {
Assert.assertNotNull(r);
}
@@ -1426,6 +1431,27 @@ public void testCannotRunQueryLogic2() throws Exception {
}
+ @Test
+ public void testIsLongRunningQuery() throws Exception {
+ Map<String,QueryLogic<?>> logics = new HashMap<>();
+ TestQueryLogic logic1 = new TestQueryLogic();
+ TestQueryLogic logic2 = new TestQueryLogic();
+ logics.put("TestQueryLogic", logic1);
+ logics.put("TestQueryLogic2", logic2);
+
+ CompositeQueryLogic c = new CompositeQueryLogic();
+ c.setQueryLogics(logics);
+
+ Assert.assertFalse(c.isLongRunningQuery());
+
+ TestQueryLogic2 logic3 = new TestQueryLogic2();
+ logics.put("TestQueryLogic3", logic3);
+
+ c.setQueryLogics(logics);
+
+ Assert.assertTrue(c.isLongRunningQuery());
+ }
+
@Test
public void testAuthorizationsUpdate() throws Exception {
Map<String,QueryLogic<?>> logics = new HashMap<>();
From c5d0de145ed7c8f648355dc63d90ddba6ffe7d1f Mon Sep 17 00:00:00 2001
From: austin007008 <143425397+austin007008@users.noreply.github.com>
Date: Fri, 3 Nov 2023 08:37:02 -0400
Subject: [PATCH 05/32] fix integration (#2157)
import sort
---
.../src/main/java/datawave/query/tld/TLD.java | 3 +-
.../query/util/ssdeep/ChunkSizeEncoding.java | 30 ++++++++--------
.../query/util/ssdeep/IntegerEncoding.java | 30 ++++++++--------
.../query/util/ssdeep/SSDeepEncoding.java | 2 +-
.../query/util/ssdeep/SSDeepHashScorer.java | 36 +++++++++----------
.../SSDeepSimilarityQueryTransformerTest.java | 4 +--
6 files changed, 53 insertions(+), 52 deletions(-)
diff --git a/warehouse/query-core/src/main/java/datawave/query/tld/TLD.java b/warehouse/query-core/src/main/java/datawave/query/tld/TLD.java
index 8b5066223e..b779dd3f17 100644
--- a/warehouse/query-core/src/main/java/datawave/query/tld/TLD.java
+++ b/warehouse/query-core/src/main/java/datawave/query/tld/TLD.java
@@ -28,7 +28,8 @@ private TLD() {}
/**
* Parses the pointer (document id) from the local Field Index key's ColumnQualifier
- *
+ *
+ *
* FI Key Structure (row, cf='fi\0field', cq='value\0datatype\0uid')
*
* The uid starts at the ColumnQualifier's second null byte and ends at the end of the sequence.
diff --git a/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/ChunkSizeEncoding.java b/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/ChunkSizeEncoding.java
index 9cb397be20..2545d2f34a 100644
--- a/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/ChunkSizeEncoding.java
+++ b/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/ChunkSizeEncoding.java
@@ -26,18 +26,18 @@
*/
//@formatter:on
public class ChunkSizeEncoding implements Serializable {
-
+
private static final int MIN_CHUNK_SIZE = 3;
private static final int DEFAULT_ENCODING_ALPHABET_LENGTH = HashReverse.LEXICAL_B64_TABLE.length;
-
+
private static final int DEFAULT_ENCODING_LENGTH = 1;
-
+
static final double L2 = Math.log(2);
-
+
private final IntegerEncoding chunkIndexEncoding;
-
+
final int minChunkSize;
-
+
/**
* Create a ChunkSizeEncoding with the default parameters of a 64 character encoding alphabet and a length of 1. This allows us to encode 64 distinct chunk
* index values. Chunk index 0 represents the MIN_CHUNK_SIZE. See class javadocs for more info.
@@ -45,43 +45,43 @@ public class ChunkSizeEncoding implements Serializable {
public ChunkSizeEncoding() {
this(MIN_CHUNK_SIZE, DEFAULT_ENCODING_ALPHABET_LENGTH, DEFAULT_ENCODING_LENGTH);
}
-
+
public ChunkSizeEncoding(int minChunkSize, int encodingAlphabetLength, int encodingLength) {
this.minChunkSize = minChunkSize;
this.chunkIndexEncoding = new IntegerEncoding(encodingAlphabetLength, encodingLength);
}
-
+
public long getLimit() {
return findChunkSizeIndex(chunkIndexEncoding.getLimit());
}
-
+
public int getLength() {
return chunkIndexEncoding.getLength();
}
-
+
public long findNthChunkSize(int index) {
return minChunkSize * ((long) Math.pow(2, index));
}
-
+
public int findChunkSizeIndex(long chunkSize) {
return (int) (Math.log(chunkSize / (float) minChunkSize) / L2);
}
-
+
public String encode(int chunkSize) {
int index = findChunkSizeIndex(chunkSize);
return chunkIndexEncoding.encode(index);
}
-
+
public byte[] encodeToBytes(int chunkSize, byte[] buffer, int offset) {
int index = findChunkSizeIndex(chunkSize);
return chunkIndexEncoding.encodeToBytes(index, buffer, offset);
}
-
+
public int decode(String encoded) {
int index = chunkIndexEncoding.decode(encoded);
return (int) findNthChunkSize(index);
}
-
+
public int decode(byte[] encoded, int offset) {
int index = chunkIndexEncoding.decode(encoded, offset);
return (int) findNthChunkSize(index);
diff --git a/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/IntegerEncoding.java b/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/IntegerEncoding.java
index b7b76238ec..6f11163319 100644
--- a/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/IntegerEncoding.java
+++ b/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/IntegerEncoding.java
@@ -7,20 +7,20 @@
* Class for encoding integers into a lexically sorted output of constant length. Employs the sorted Base64 alphabet captured in the HashReverse class.
*/
public class IntegerEncoding implements Serializable {
-
+
// The number of distinct characters used for encoding
final int base;
// the target length of the encoding
final int length;
// the max integer value we can encode, derived from the base and length parameters.
final int limit;
-
+
/**
* We are using the LEXICAL_B64_TABLE to encode integers to characters, our max base (the unique characters we use for encoding) is based on the size of
* this alphabet.
*/
private static final int MAX_BASE = HashReverse.LEXICAL_B64_TABLE.length;
-
+
/**
* Create an unsigned integer encoder that uses the specified base (up to 64) and length (which can't generate numbers larger than Integer.MAX_VALUE). This
* uses the lexically sorted Base 64 alphabet for encoding.
@@ -45,21 +45,21 @@ public IntegerEncoding(int base, int length) {
}
this.limit = (int) calculatedLimit; // truncation is fine here.
}
-
+
/** Return the maximum value this encoder can encode */
public int getLimit() {
return limit;
}
-
+
public int getLength() {
return length;
}
-
+
/** Encode the provided value, return a string result */
public String encode(int value) {
return new String(encodeToBytes(value, new byte[length], 0));
}
-
+
/**
* encode the provided value, writing the result to the provided buffer starting offset
*
@@ -75,11 +75,11 @@ public byte[] encodeToBytes(int value, byte[] buffer, int offset) {
if (value < 0 || value >= limit) {
throw new IllegalArgumentException("Can't encode " + value + " is it out of range, max: " + limit + " was: " + value);
}
-
+
if (buffer.length < offset + length) {
throw new IndexOutOfBoundsException("Can't encode a value of length " + length + " at offset " + offset + " buffer too small: " + buffer.length);
}
-
+
int remaining = value;
for (int place = length; place > 0; place--) {
final int scale = ((int) Math.pow(base, place - 1));
@@ -92,7 +92,7 @@ public byte[] encodeToBytes(int value, byte[] buffer, int offset) {
}
return buffer;
}
-
+
// TODO: make this just like encodeToBytes?
public static byte[] encodeBaseTenDigitBytes(int value) {
int remaining = value;
@@ -108,7 +108,7 @@ public static byte[] encodeBaseTenDigitBytes(int value) {
}
return results;
}
-
+
/**
* Decode the first _length_ characters in the encoded value into an integer, where length is specified in the constructor.
*
@@ -122,7 +122,7 @@ public int decode(String encodedValue) {
}
return decode(encodedValue.getBytes(StandardCharsets.UTF_8), 0);
}
-
+
/**
* decode the value contained within the provided byte[] starting at the specified offset
*
@@ -140,7 +140,7 @@ public int decode(byte[] encoded, int offset) {
if (encoded.length < offset + length) {
throw new IndexOutOfBoundsException("Can't decode a value of length " + length + " from offset " + offset + " buffer too small: " + encoded.length);
}
-
+
int result = 0;
for (int place = length; place > 0; place--) {
int pos = offset + (length - place);
@@ -150,11 +150,11 @@ public int decode(byte[] encoded, int offset) {
}
result += (int) Math.pow(base, place - 1) * value;
}
-
+
if (result > limit) {
throw new IllegalArgumentException("Can't decode input is it out of range, max: " + limit + " was: " + result);
}
-
+
return result;
}
}
diff --git a/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/SSDeepEncoding.java b/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/SSDeepEncoding.java
index 955186cd2b..a067bd200a 100644
--- a/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/SSDeepEncoding.java
+++ b/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/SSDeepEncoding.java
@@ -11,7 +11,7 @@ public class SSDeepEncoding implements Serializable {
public byte[] encode(String ngram) {
return encodeToBytes(ngram, new byte[ngram.length()], 0);
}
-
+
public byte[] encodeToBytes(String ngram, byte[] buffer, int offset) {
for (int i = 0; i < ngram.length(); i++) {
buffer[i + offset] = (byte) ngram.charAt(i);
diff --git a/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/SSDeepHashScorer.java b/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/SSDeepHashScorer.java
index fce7cb5d66..428a0405c1 100644
--- a/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/SSDeepHashScorer.java
+++ b/warehouse/query-core/src/main/java/datawave/query/util/ssdeep/SSDeepHashScorer.java
@@ -6,17 +6,17 @@
/** Implements functions to calculate a similarity score for a pair of SSDeepHashes */
public class SSDeepHashScorer {
private static final Logger log = Logger.getLogger(SSDeepHash.class);
-
+
private final int maxRepeatedCharacters;
-
+
public SSDeepHashScorer() {
this(SSDeepHash.DEFAULT_MAX_REPEATED_CHARACTERS);
}
-
+
public SSDeepHashScorer(int maxRepeatedCharacters) {
this.maxRepeatedCharacters = maxRepeatedCharacters;
}
-
+
/**
* Compare two ssdeep hashes, returning a score between 0 to 100 that indicates similarity. A score of 0 means that the items are not similar at all whereas
* a score of 100 indicates a high degree of similarity.
@@ -33,7 +33,7 @@ public int apply(SSDeepHash signature1, SSDeepHash signature2) {
}
final long chunkSize1 = signature1.getChunkSize();
final long chunkSize2 = signature2.getChunkSize();
-
+
// We require the chunk size to either be equal, or for one to be twice the other. If the chunk sizes don't
// match then we are comparing apples to oranges. This isn't an 'error' per se. We could have two valid
// ssdeep hashes, but with chunk sizes so different they can't be compared.
@@ -43,14 +43,14 @@ public int apply(SSDeepHash signature1, SSDeepHash signature2) {
}
return 0;
}
-
+
// There is very little information content in sequences of the same character like 'LLLLL'. Eliminate any
// sequences longer than MAX_REPEATED_CHARACTERS (3).
final String s1chunk = SSDeepHash.normalizeSSDeepChunk(signature1.getChunk(), maxRepeatedCharacters);
final String s1doubleChunk = SSDeepHash.normalizeSSDeepChunk(signature1.getDoubleChunk(), maxRepeatedCharacters);
final String s2chunk = SSDeepHash.normalizeSSDeepChunk(signature2.getChunk(), maxRepeatedCharacters);
final String s2doubleChunk = SSDeepHash.normalizeSSDeepChunk(signature2.getDoubleChunk(), maxRepeatedCharacters);
-
+
// Each ssdeep has two chunks with different chunk sizes. Choose which ones to use from each hash for scoring.
final long score;
if (chunkSize1 == chunkSize2) {
@@ -65,10 +65,10 @@ public int apply(SSDeepHash signature1, SSDeepHash signature2) {
// The second ssdeep has twice the chunk size of the first.
score = scoreChunks(s1doubleChunk, s2chunk, chunkSize2);
}
-
+
return (int) score;
}
-
+
/**
* This is the low level chunk scoring algorithm. It takes two chunks and scores them on a scale of 0-100 where 0 is a terrible match and 100 is a great
* match. The chunkSize is used to cope with very small messages.
@@ -76,44 +76,44 @@ public int apply(SSDeepHash signature1, SSDeepHash signature2) {
private static int scoreChunks(final String s1, final String s2, final long chunkSize) {
final int len1 = s1.length();
final int len2 = s2.length();
-
+
if ((len1 > SSDeepHash.CHUNK_LENGTH) || (len2 > SSDeepHash.CHUNK_LENGTH)) {
// one of the chunk lengths exceeds the max chunk length, perhaps it is not a real ssdeep?
return 0;
}
-
+
// Compute the edit distance between the two chunk strings. The edit distance gives us a pretty good idea of
// how closely related the two chunks are.
int editDistance = LevenshteinDistance.getDefaultInstance().apply(s1, s2);
if (log.isDebugEnabled()) {
log.debug("edit_dist: " + editDistance);
}
-
+
// Scale the edit distance by the lengths of the two chunks. This changes the baseScore to be a measure of the
// proportion of the message that has changed rather than an absolute quantity. It also copes with the
// variability of the chunk string lengths.
int score = (editDistance * SSDeepHash.CHUNK_LENGTH) / (len1 + len2);
-
+
// At this stage the baseScore occurs roughly on a 0-64 scale,
// with 0 being a good match and 64 being a complete mismatch.
-
+
// Rescale to a 0-100 scale (friendlier to humans).
score = (100 * score) / SSDeepHash.CHUNK_LENGTH;
-
+
// It is possible to get a baseScore above 100 here, but it is a really terrible match.
if (score >= 100) {
return 0;
}
-
+
// Invert the score with 0 being a poor match and 100 being an excellent match.
score = 100 - score;
-
+
// When the chunk size is small we don't want to exaggerate the match.
final int threshold = (int) (chunkSize / SSDeepHash.MIN_CHUNK_SIZE * Math.min(len1, len2));
if (score > threshold) {
score = threshold;
}
-
+
return score;
}
}
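The chunk-scoring steps above are pure integer arithmetic: scale the edit distance by the chunk lengths, rescale to 0-100, invert, then cap by a threshold derived from the chunk size. A worked sketch (not part of the patch) with the chunk length of 64 and minimum chunk size of 3 written as literals, both assumed from the comments above and the ssdeep specification:

    // Illustrative only: the scoreChunks arithmetic with the SSDeepHash constants inlined.
    public class ChunkScoreSketch {
        static int scoreChunks(int editDistance, int len1, int len2, long chunkSize) {
            int score = (editDistance * 64) / (len1 + len2); // proportion of the message changed, roughly 0-64
            score = (100 * score) / 64;                      // rescale to 0-100
            if (score >= 100) {
                return 0;                                    // effectively no match
            }
            score = 100 - score;                             // invert: 100 = excellent match
            int threshold = (int) (chunkSize / 3 * Math.min(len1, len2));
            return Math.min(score, threshold);               // damp matches for very small chunk sizes
        }

        public static void main(String[] args) {
            // e.g. edit distance 20 between two 30-character chunks at chunk size 3072
            // gives (20*64)/60 = 21, rescaled to 32, inverted to 68.
            System.out.println(scoreChunks(20, 30, 30, 3072)); // 68
        }
    }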
diff --git a/warehouse/query-core/src/test/java/datawave/query/transformer/SSDeepSimilarityQueryTransformerTest.java b/warehouse/query-core/src/test/java/datawave/query/transformer/SSDeepSimilarityQueryTransformerTest.java
index 4d2399b7e4..b5b461e1c3 100644
--- a/warehouse/query-core/src/test/java/datawave/query/transformer/SSDeepSimilarityQueryTransformerTest.java
+++ b/warehouse/query-core/src/test/java/datawave/query/transformer/SSDeepSimilarityQueryTransformerTest.java
@@ -6,7 +6,6 @@
import java.util.List;
import java.util.Map;
-import datawave.webservice.query.QueryImpl;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Value;
import org.easymock.EasyMock;
@@ -29,6 +28,7 @@
import datawave.query.util.ssdeep.NGramTuple;
import datawave.query.util.ssdeep.SSDeepHash;
import datawave.webservice.query.Query;
+import datawave.webservice.query.QueryImpl;
import datawave.webservice.query.result.event.DefaultEvent;
import datawave.webservice.query.result.event.DefaultField;
import datawave.webservice.query.result.event.ResponseObjectFactory;
@@ -53,7 +53,7 @@ public class SSDeepSimilarityQueryTransformerTest {
public void basicExpects(Key k) {
EasyMock.expect(mockQuery.getQueryAuthorizations()).andReturn("A,B,C");
- EasyMock.expect(mockQuery.findParameter("minScore")).andReturn(new QueryImpl.Parameter("minScore",""));
+ EasyMock.expect(mockQuery.findParameter("minScore")).andReturn(new QueryImpl.Parameter("minScore", ""));
EasyMock.expect(mockResponseFactory.getEventQueryResponse()).andReturn(new DefaultEventQueryResponse());
EasyMock.expect(mockResponseFactory.getEvent()).andReturn(new DefaultEvent()).times(1);
EasyMock.expect(mockResponseFactory.getField()).andReturn(new DefaultField()).times(4);
From 7299f95047ea983e53e97ee44a70a80d2828fe79 Mon Sep 17 00:00:00 2001
From: Ivan Bella
Date: Tue, 17 Oct 2023 08:42:29 -0400
Subject: [PATCH 06/32] Adding a short lived cache around the remote user
operations (#2128)
---
warehouse/core/src/main/resources/CacheContext.xml | 3 +--
.../authorization/remote/RemoteUserOperationsImpl.java | 7 +++++++
2 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/warehouse/core/src/main/resources/CacheContext.xml b/warehouse/core/src/main/resources/CacheContext.xml
index 400bfc9e8b..a8ae9dc4c8 100644
--- a/warehouse/core/src/main/resources/CacheContext.xml
+++ b/warehouse/core/src/main/resources/CacheContext.xml
@@ -22,8 +22,7 @@
-
+
-
diff --git a/web-services/security/src/main/java/datawave/security/authorization/remote/RemoteUserOperationsImpl.java b/web-services/security/src/main/java/datawave/security/authorization/remote/RemoteUserOperationsImpl.java
index c17655c8a0..d782a03c07 100644
--- a/web-services/security/src/main/java/datawave/security/authorization/remote/RemoteUserOperationsImpl.java
+++ b/web-services/security/src/main/java/datawave/security/authorization/remote/RemoteUserOperationsImpl.java
@@ -50,6 +50,13 @@ public void init() {
}
@Override
+ @Cacheable(value = "getRemoteUser", key = "{#principal}", cacheManager = "remoteUserOperationsCacheManager")
+ public DatawavePrincipal getRemoteUser(DatawavePrincipal principal) throws AuthorizationException {
+ return UserOperations.super.getRemoteUser(principal);
+ }
+
+ @Override
+ @Cacheable(value = "listEffectiveAuthorizations", key = "{#callerObject}", cacheManager = "remoteUserOperationsCacheManager")
public AuthorizationsListBase listEffectiveAuthorizations(Object callerObject) throws AuthorizationException {
init();
final DatawavePrincipal principal = getDatawavePrincipal(callerObject);
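The @Cacheable annotations above refer to a cacheManager named remoteUserOperationsCacheManager that is declared in CacheContext.xml (its XML is not reproduced in this patch text). Purely as a hypothetical illustration of what a short-lived cache manager of that name could look like in Java config, with a made-up 20-second expiry:

    // Hypothetical sketch only; the real configuration lives in CacheContext.xml.
    import java.util.concurrent.TimeUnit;
    import org.springframework.cache.annotation.EnableCaching;
    import org.springframework.cache.caffeine.CaffeineCacheManager;
    import org.springframework.context.annotation.Bean;
    import org.springframework.context.annotation.Configuration;
    import com.github.benmanes.caffeine.cache.Caffeine;

    @Configuration
    @EnableCaching
    public class RemoteUserOperationsCacheConfigSketch {
        @Bean
        public CaffeineCacheManager remoteUserOperationsCacheManager() {
            // cache names match the @Cacheable annotations in this patch
            CaffeineCacheManager manager = new CaffeineCacheManager("listEffectiveAuthorizations", "getRemoteUser");
            // short-lived entries so remote authorization changes are picked up quickly (20s is an assumption)
            manager.setCaffeine(Caffeine.newBuilder().expireAfterWrite(20, TimeUnit.SECONDS));
            return manager;
        }
    }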
From 1a82e8b364cf96e4b2ef4604418c8901a15ddb66 Mon Sep 17 00:00:00 2001
From: Ivan Bella
Date: Mon, 6 Nov 2023 13:52:09 +0000
Subject: [PATCH 07/32] resolve merge conflict for short lived remote user
operations cache
---
.../authorization/remote/RemoteUserOperationsImpl.java | 6 ------
1 file changed, 6 deletions(-)
diff --git a/web-services/security/src/main/java/datawave/security/authorization/remote/RemoteUserOperationsImpl.java b/web-services/security/src/main/java/datawave/security/authorization/remote/RemoteUserOperationsImpl.java
index d782a03c07..51c976e2ed 100644
--- a/web-services/security/src/main/java/datawave/security/authorization/remote/RemoteUserOperationsImpl.java
+++ b/web-services/security/src/main/java/datawave/security/authorization/remote/RemoteUserOperationsImpl.java
@@ -48,12 +48,6 @@ public void init() {
initialized = true;
}
}
-
- @Override
- @Cacheable(value = "getRemoteUser", key = "{#principal}", cacheManager = "remoteUserOperationsCacheManager")
- public DatawavePrincipal getRemoteUser(DatawavePrincipal principal) throws AuthorizationException {
- return UserOperations.super.getRemoteUser(principal);
- }
@Override
@Cacheable(value = "listEffectiveAuthorizations", key = "{#callerObject}", cacheManager = "remoteUserOperationsCacheManager")
From 32fc6df81b68687a99600918fbd728aa52588657 Mon Sep 17 00:00:00 2001
From: Ivan Bella
Date: Mon, 6 Nov 2023 19:56:49 +0000
Subject: [PATCH 08/32] formatting
---
.../security/authorization/remote/RemoteUserOperationsImpl.java | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/web-services/security/src/main/java/datawave/security/authorization/remote/RemoteUserOperationsImpl.java b/web-services/security/src/main/java/datawave/security/authorization/remote/RemoteUserOperationsImpl.java
index 51c976e2ed..e7a497c294 100644
--- a/web-services/security/src/main/java/datawave/security/authorization/remote/RemoteUserOperationsImpl.java
+++ b/web-services/security/src/main/java/datawave/security/authorization/remote/RemoteUserOperationsImpl.java
@@ -48,7 +48,7 @@ public void init() {
initialized = true;
}
}
-
+
@Override
@Cacheable(value = "listEffectiveAuthorizations", key = "{#callerObject}", cacheManager = "remoteUserOperationsCacheManager")
public AuthorizationsListBase listEffectiveAuthorizations(Object callerObject) throws AuthorizationException {
From 5e6c0cfa962866684eb9c40895a66561769b8656 Mon Sep 17 00:00:00 2001
From: Whitney O'Meara
Date: Tue, 7 Nov 2023 15:00:47 +0000
Subject: [PATCH 09/32] Updated the audit service submodule
---
microservices/services/audit | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/microservices/services/audit b/microservices/services/audit
index e3c3e32e94..ea0f0cefd9 160000
--- a/microservices/services/audit
+++ b/microservices/services/audit
@@ -1 +1 @@
-Subproject commit e3c3e32e94df3193ab33e9877609806f9c2a3079
+Subproject commit ea0f0cefd9974cc7ec6efe359d0005196b3eb797
From 73885fb9e8e8b8875885dfe369d93fa508e06204 Mon Sep 17 00:00:00 2001
From: Laura Schanno
Date: Mon, 13 Nov 2023 13:03:01 -0500
Subject: [PATCH 10/32] Add aggregation functionality for #GROUP_BY function
(#1914)
* Add aggregation functionality for #GROUP_BY function
* Provide the ability to aggregate fields when grouping. Specifically, add
the new functions #SUM, #MIN, #MAX, #COUNT, #AVERAGE that will determine
the aggregate value for any specified fields against the entries that
match any particular grouping.
Additionally, add GroupAggregateFields to act as a central place to
* Updated to specify a reverse model mapping separate from the
inverseModelMap to ensure unique reverse mappings
* Reduce the fields to be the mapped set of fields and avoid remapping the
fields everywhere that we do not need to.
---------
Co-authored-by: Ivan Bella
---
.../main/java/datawave/query/Constants.java | 8 +
.../java/datawave/query/QueryParameters.java | 29 +
.../common/grouping/AbstractAggregator.java | 46 +
.../common/grouping/AggregateOperation.java | 8 +
.../query/common/grouping/Aggregator.java | 81 ++
.../common/grouping/AverageAggregator.java | 157 ++
.../common/grouping/CountAggregator.java | 103 ++
.../common/grouping/DocumentGrouper.java | 717 ++++++++++
.../datawave/query/common/grouping/Field.java | 124 ++
.../common/grouping/FieldAggregator.java | 404 ++++++
.../datawave/query/common/grouping/Group.java | 168 +++
.../query/common/grouping/GroupFields.java | 466 ++++++
.../query/common/grouping/Grouping.java | 118 ++
.../common/grouping/GroupingAttribute.java | 62 +
.../query/common/grouping/GroupingUtil.java | 330 -----
.../query/common/grouping/GroupingUtils.java | 267 ++++
.../query/common/grouping/Groups.java | 108 ++
.../common/grouping/ImmutableGrouping.java | 95 ++
.../query/common/grouping/MaxAggregator.java | 110 ++
.../query/common/grouping/MinAggregator.java | 113 ++
.../query/common/grouping/SumAggregator.java | 117 ++
.../query/config/ShardQueryConfiguration.java | 34 +-
.../query/iterator/GroupingIterator.java | 118 +-
.../query/iterator/QueryIterator.java | 4 +-
.../datawave/query/iterator/QueryOptions.java | 14 +-
.../query/jexl/functions/QueryFunctions.java | 7 +-
.../functions/QueryFunctionsDescriptor.java | 5 +
.../QueryOptionsFromQueryVisitor.java | 33 +-
.../language/functions/jexl/Average.java | 48 +
.../query/language/functions/jexl/Count.java | 48 +
.../query/language/functions/jexl/Max.java | 48 +
.../query/language/functions/jexl/Min.java | 48 +
.../query/language/functions/jexl/Sum.java | 48 +
.../query/planner/DefaultQueryPlanner.java | 28 +-
.../query/planner/QueryOptionsSwitch.java | 61 +-
.../query/tables/ShardQueryLogic.java | 73 +-
.../query/transformer/GroupingTransform.java | 97 +-
.../grouping/AverageAggregatorTest.java | 73 +
.../common/grouping/CountAggregatorTest.java | 52 +
.../common/grouping/DocumentGrouperTest.java | 1100 ++++++++++++++
.../common/grouping/GroupFieldsTest.java | 247 ++++
.../common/grouping/MaxAggregatorTest.java | 123 ++
.../common/grouping/MinAggregatorTest.java | 119 ++
.../common/grouping/SumAggregatorTest.java | 74 +
.../config/ShardQueryConfigurationTest.java | 8 +-
.../query/transformer/GroupingTest.java | 1262 +++++++++++------
.../transformer/GroupingTestWithModel.java | 534 -------
.../query/util/VisibilityWiseGuysIngest.java | 98 +-
.../VisibilityWiseGuysIngestWithModel.java | 98 +-
.../test/java/datawave/test/GroupAssert.java | 113 ++
.../test/java/datawave/test/GroupsAssert.java | 42 +
.../datawave/query/QueryLogicFactory.xml | 5 +
.../datawave/query/QueryLogicFactory.xml | 5 +
53 files changed, 6656 insertions(+), 1642 deletions(-)
create mode 100644 warehouse/query-core/src/main/java/datawave/query/common/grouping/AbstractAggregator.java
create mode 100644 warehouse/query-core/src/main/java/datawave/query/common/grouping/AggregateOperation.java
create mode 100644 warehouse/query-core/src/main/java/datawave/query/common/grouping/Aggregator.java
create mode 100644 warehouse/query-core/src/main/java/datawave/query/common/grouping/AverageAggregator.java
create mode 100644 warehouse/query-core/src/main/java/datawave/query/common/grouping/CountAggregator.java
create mode 100644 warehouse/query-core/src/main/java/datawave/query/common/grouping/DocumentGrouper.java
create mode 100644 warehouse/query-core/src/main/java/datawave/query/common/grouping/Field.java
create mode 100644 warehouse/query-core/src/main/java/datawave/query/common/grouping/FieldAggregator.java
create mode 100644 warehouse/query-core/src/main/java/datawave/query/common/grouping/Group.java
create mode 100644 warehouse/query-core/src/main/java/datawave/query/common/grouping/GroupFields.java
create mode 100644 warehouse/query-core/src/main/java/datawave/query/common/grouping/Grouping.java
create mode 100644 warehouse/query-core/src/main/java/datawave/query/common/grouping/GroupingAttribute.java
delete mode 100644 warehouse/query-core/src/main/java/datawave/query/common/grouping/GroupingUtil.java
create mode 100644 warehouse/query-core/src/main/java/datawave/query/common/grouping/GroupingUtils.java
create mode 100644 warehouse/query-core/src/main/java/datawave/query/common/grouping/Groups.java
create mode 100644 warehouse/query-core/src/main/java/datawave/query/common/grouping/ImmutableGrouping.java
create mode 100644 warehouse/query-core/src/main/java/datawave/query/common/grouping/MaxAggregator.java
create mode 100644 warehouse/query-core/src/main/java/datawave/query/common/grouping/MinAggregator.java
create mode 100644 warehouse/query-core/src/main/java/datawave/query/common/grouping/SumAggregator.java
create mode 100644 warehouse/query-core/src/main/java/datawave/query/language/functions/jexl/Average.java
create mode 100644 warehouse/query-core/src/main/java/datawave/query/language/functions/jexl/Count.java
create mode 100644 warehouse/query-core/src/main/java/datawave/query/language/functions/jexl/Max.java
create mode 100644 warehouse/query-core/src/main/java/datawave/query/language/functions/jexl/Min.java
create mode 100644 warehouse/query-core/src/main/java/datawave/query/language/functions/jexl/Sum.java
create mode 100644 warehouse/query-core/src/test/java/datawave/query/common/grouping/AverageAggregatorTest.java
create mode 100644 warehouse/query-core/src/test/java/datawave/query/common/grouping/CountAggregatorTest.java
create mode 100644 warehouse/query-core/src/test/java/datawave/query/common/grouping/DocumentGrouperTest.java
create mode 100644 warehouse/query-core/src/test/java/datawave/query/common/grouping/GroupFieldsTest.java
create mode 100644 warehouse/query-core/src/test/java/datawave/query/common/grouping/MaxAggregatorTest.java
create mode 100644 warehouse/query-core/src/test/java/datawave/query/common/grouping/MinAggregatorTest.java
create mode 100644 warehouse/query-core/src/test/java/datawave/query/common/grouping/SumAggregatorTest.java
delete mode 100644 warehouse/query-core/src/test/java/datawave/query/transformer/GroupingTestWithModel.java
create mode 100644 warehouse/query-core/src/test/java/datawave/test/GroupAssert.java
create mode 100644 warehouse/query-core/src/test/java/datawave/test/GroupsAssert.java
diff --git a/warehouse/query-core/src/main/java/datawave/query/Constants.java b/warehouse/query-core/src/main/java/datawave/query/Constants.java
index bf1141a76e..c1ab9ff4f5 100644
--- a/warehouse/query-core/src/main/java/datawave/query/Constants.java
+++ b/warehouse/query-core/src/main/java/datawave/query/Constants.java
@@ -30,8 +30,16 @@ public class Constants {
public static final String BRACKET_END = "]";
+ public static final String EQUALS = "=";
+
public static final String FORWARD_SLASH = "/";
+ public static final String LEFT_PAREN = "(";
+
+ public static final String RIGHT_PAREN = ")";
+
+ public static final String PIPE = "|";
+
public static final Text TEXT_NULL = new Text(NULL);
public static final Text FI_PREFIX = new Text("fi");
diff --git a/warehouse/query-core/src/main/java/datawave/query/QueryParameters.java b/warehouse/query-core/src/main/java/datawave/query/QueryParameters.java
index a0f0e84d34..8a2ce76e09 100644
--- a/warehouse/query-core/src/main/java/datawave/query/QueryParameters.java
+++ b/warehouse/query-core/src/main/java/datawave/query/QueryParameters.java
@@ -142,7 +142,36 @@ public class QueryParameters {
*/
public static final String MATCHING_FIELD_SETS = "matching.field.sets";
+ /**
+ * Used to specify fields to perform a group-by with.
+ */
public static final String GROUP_FIELDS = "group.fields";
+
+ /**
+ * Used to specify the fields for which a sum should be calculated in groups resulting from a group-by operation.
+ */
+ public static final String SUM_FIELDS = "sum.fields";
+
+ /**
+ * Used to specify the fields for which the max should be found in groups resulting from a group-by operation.
+ */
+ public static final String MAX_FIELDS = "max.fields";
+
+ /**
+ * Used to specify the fields for which the min should be found in groups resulting from a group-by operation.
+ */
+ public static final String MIN_FIELDS = "min.fields";
+
+ /**
+ * Used to specify the fields for which a count should be calculated in groups resulting from a group-by operation.
+ */
+ public static final String COUNT_FIELDS = "count.fields";
+
+ /**
+ * Used to specify the fields for which an average should be calculated in groups resulting from a group-by operation.
+ */
+ public static final String AVERAGE_FIELDS = "average.fields";
+
public static final String GROUP_FIELDS_BATCH_SIZE = "group.fields.batch.size";
public static final String UNIQUE_FIELDS = "unique.fields";
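The new parameters are sibling options to group.fields and name the fields to aggregate within each grouping. A small illustrative sketch (not part of the patch) showing how they might be supplied together; the field names GENDER and AGE are invented for the example:

    // Illustrative only: pairing the aggregation parameters with a group-by field.
    import java.util.HashMap;
    import java.util.Map;

    public class GroupByParameterSketch {
        public static void main(String[] args) {
            Map<String,String> parameters = new HashMap<>();
            parameters.put("group.fields", "GENDER");   // QueryParameters.GROUP_FIELDS
            parameters.put("sum.fields", "AGE");        // QueryParameters.SUM_FIELDS
            parameters.put("average.fields", "AGE");    // QueryParameters.AVERAGE_FIELDS
            parameters.put("count.fields", "AGE");      // QueryParameters.COUNT_FIELDS
            // roughly equivalent LUCENE functions: #GROUPBY(GENDER) #SUM(AGE) #AVERAGE(AGE) #COUNT(AGE)
            parameters.forEach((k, v) -> System.out.println(k + "=" + v));
        }
    }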
diff --git a/warehouse/query-core/src/main/java/datawave/query/common/grouping/AbstractAggregator.java b/warehouse/query-core/src/main/java/datawave/query/common/grouping/AbstractAggregator.java
new file mode 100644
index 0000000000..97f5a176a9
--- /dev/null
+++ b/warehouse/query-core/src/main/java/datawave/query/common/grouping/AbstractAggregator.java
@@ -0,0 +1,46 @@
+package datawave.query.common.grouping;
+
+import java.util.Set;
+
+import org.apache.accumulo.core.security.ColumnVisibility;
+
+import datawave.query.attributes.Attribute;
+
+/**
+ * Abstract implementation of {@link Aggregator}
+ *
+ * @param <AGGREGATE>
+ * the aggregation result type
+ */
+public abstract class AbstractAggregator<AGGREGATE> implements Aggregator<AGGREGATE> {
+
+ /**
+ * The name of the field being aggregated.
+ */
+ protected final String field;
+
+ protected AbstractAggregator(String field) {
+ this.field = field;
+ }
+
+ @Override
+ public abstract AggregateOperation getOperation();
+
+ @Override
+ public String getField() {
+ return this.field;
+ }
+
+ @Override
+ public abstract Set<ColumnVisibility> getColumnVisibilities();
+
+ @Override
+ public abstract AGGREGATE getAggregation();
+
+ @Override
+ public abstract void aggregate(Attribute<?> value);
+
+ @Override
+ public abstract void merge(Aggregator<?> other);
+
+}
diff --git a/warehouse/query-core/src/main/java/datawave/query/common/grouping/AggregateOperation.java b/warehouse/query-core/src/main/java/datawave/query/common/grouping/AggregateOperation.java
new file mode 100644
index 0000000000..6cf2d0e2ad
--- /dev/null
+++ b/warehouse/query-core/src/main/java/datawave/query/common/grouping/AggregateOperation.java
@@ -0,0 +1,8 @@
+package datawave.query.common.grouping;
+
+/**
+ * Represents the aggregation operations currently available.
+ */
+public enum AggregateOperation {
+ SUM, MAX, MIN, COUNT, AVERAGE
+}
diff --git a/warehouse/query-core/src/main/java/datawave/query/common/grouping/Aggregator.java b/warehouse/query-core/src/main/java/datawave/query/common/grouping/Aggregator.java
new file mode 100644
index 0000000000..ec4768e5ed
--- /dev/null
+++ b/warehouse/query-core/src/main/java/datawave/query/common/grouping/Aggregator.java
@@ -0,0 +1,81 @@
+package datawave.query.common.grouping;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.accumulo.core.security.ColumnVisibility;
+
+import datawave.query.attributes.Attribute;
+
+/**
+ * Provides the methods by which aggregates can be calculated for fields when grouped by other fields.
+ *
+ * @param <AGGREGATE>
+ * the aggregate result type
+ */
+public interface Aggregator<AGGREGATE> {
+
+ /**
+ * Return the aggregate operation being performed.
+ *
+ * @return the aggregate operation
+ */
+ AggregateOperation getOperation();
+
+ /**
+ * Return the field being aggregated.
+ *
+ * @return the field
+ */
+ String getField();
+
+ /**
+ * Returns an unmodifiable set of all distinct column visibilities for each attribute aggregated into this aggregator. Possibly empty, but never null.
+ *
+ * @return a set of the column visibilities
+ */
+ Set<ColumnVisibility> getColumnVisibilities();
+
+ /**
+ * Return the aggregation result.
+ *
+ * @return the aggregation
+ */
+ AGGREGATE getAggregation();
+
+ /**
+ * Return true if this aggregator has aggregated at least one attribute.
+ *
+ * @return true if this aggregator has at least one attribute aggregated to it, or false otherwise
+ */
+ boolean hasAggregation();
+
+ /**
+ * Aggregate the given value into this aggregator.
+ *
+ * @param value
+ * the value to aggregate
+ */
+ void aggregate(Attribute<?> value);
+
+ /**
+ * Aggregate each of the given values into this aggregator.
+ *
+ * @param values
+ * the value to aggregate
+ */
+ default void aggregateAll(Collection<Attribute<?>> values) {
+ values.forEach(this::aggregate);
+ }
+
+ /**
+ * Merges the given aggregator into this aggregator
+ *
+ * @param other
+ * the aggregator to merge
+ * @throws IllegalArgumentException
+ * if the other aggregator is not the same type as this aggregator
+ */
+ void merge(Aggregator<?> other);
+}
diff --git a/warehouse/query-core/src/main/java/datawave/query/common/grouping/AverageAggregator.java b/warehouse/query-core/src/main/java/datawave/query/common/grouping/AverageAggregator.java
new file mode 100644
index 0000000000..a717ddd5cd
--- /dev/null
+++ b/warehouse/query-core/src/main/java/datawave/query/common/grouping/AverageAggregator.java
@@ -0,0 +1,157 @@
+package datawave.query.common.grouping;
+
+import java.math.BigDecimal;
+import java.math.MathContext;
+import java.math.RoundingMode;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.accumulo.core.security.ColumnVisibility;
+import org.apache.commons.lang.builder.ToStringBuilder;
+
+import datawave.query.attributes.Attribute;
+import datawave.query.attributes.Numeric;
+import datawave.query.attributes.TypeAttribute;
+
+/**
+ * Calculates the average value of aggregated field values. This is limited to fields for which their values can be parsed as {@link BigDecimal} instances.
+ */
+public class AverageAggregator extends AbstractAggregator<BigDecimal> {
+
+ private static final MathContext MATH_CONTEXT = new MathContext(10, RoundingMode.HALF_UP);
+
+ /**
+ * The current numerator value of the average.
+ */
+ private BigDecimal numerator;
+
+ /**
+ * The current divisor value of the average.
+ */
+ private BigDecimal divisor;
+
+ /**
+ * The current average value.
+ */
+ private BigDecimal average;
+
+ /**
+ * The column visibilities of all attributes aggregated.
+ */
+ private final Set<ColumnVisibility> columnVisibilities;
+
+ public static AverageAggregator of(String field, TypeAttribute<BigDecimal> numerator, TypeAttribute<BigDecimal> divisor) {
+ return new AverageAggregator(field, numerator.getType().getDelegate(), divisor.getType().getDelegate(), numerator.getColumnVisibility());
+ }
+
+ public AverageAggregator(String field) {
+ super(field);
+ this.columnVisibilities = new HashSet<>();
+ }
+
+ private AverageAggregator(String field, BigDecimal numerator, BigDecimal divisor, ColumnVisibility columnVisibility) {
+ this(field);
+ this.numerator = numerator;
+ this.divisor = divisor;
+ this.average = numerator.divide(divisor, MATH_CONTEXT);
+ if (columnVisibility != null) {
+ this.columnVisibilities.add(columnVisibility);
+ }
+ }
+
+ /**
+ * Returns {@link AggregateOperation#AVERAGE}.
+ *
+ * @return {@link AggregateOperation#AVERAGE}
+ */
+ @Override
+ public AggregateOperation getOperation() {
+ return AggregateOperation.AVERAGE;
+ }
+
+ @Override
+ public Set<ColumnVisibility> getColumnVisibilities() {
+ return Collections.unmodifiableSet(columnVisibilities);
+ }
+
+ /**
+ * Return the average value seen for the field.
+ *
+ * @return the average value, or null if no values have been aggregated yet
+ */
+ @Override
+ public BigDecimal getAggregation() {
+ return average;
+ }
+
+ @Override
+ public boolean hasAggregation() {
+ return average != null;
+ }
+
+ /**
+ * Return the current sum for the field values.
+ *
+ * @return the sum
+ */
+ public BigDecimal getNumerator() {
+ return numerator;
+ }
+
+ /**
+ * Return the current count for the field.
+ *
+ * @return the count
+ */
+ public BigDecimal getDivisor() {
+ return divisor;
+ }
+
+ /**
+ * Adds the value into the current sum, increments the total count by one, and recalculates the average.
+ *
+ * @param value
+ * the value to aggregate
+ * @throws IllegalArgumentException
+ * if the given value is not a {@link Numeric} type
+ */
+ @Override
+ public void aggregate(Attribute<?> value) {
+ BigDecimal number;
+ try {
+ number = new BigDecimal(value.getData().toString());
+ } catch (Exception e) {
+ throw new IllegalArgumentException("Unable to calculate an average with non-numerical value '" + value.getData() + "'", e);
+ }
+ if (numerator == null) {
+ numerator = number;
+ divisor = BigDecimal.ONE;
+ } else {
+ numerator = numerator.add(number);
+ divisor = divisor.add(BigDecimal.ONE);
+ }
+ average = numerator.divide(divisor, MATH_CONTEXT);
+ columnVisibilities.add(value.getColumnVisibility());
+ }
+
+ @Override
+ public void merge(Aggregator<?> other) {
+ if (other instanceof AverageAggregator) {
+ AverageAggregator aggregator = (AverageAggregator) other;
+ this.numerator = numerator.add(aggregator.numerator);
+ this.divisor = divisor.add(aggregator.divisor);
+ this.average = this.numerator.divide(this.divisor, MATH_CONTEXT);
+ this.columnVisibilities.addAll(aggregator.columnVisibilities);
+ } else {
+ throw new IllegalArgumentException("Cannot merge instance of " + other.getClass().getName());
+ }
+ }
+
+ @Override
+ public String toString() {
+ return new ToStringBuilder(this).append("field", field).append("average", average).append("numerator", numerator).append("divisor", divisor)
+ .append("columnVisibilities", columnVisibilities).toString();
+ }
+}
diff --git a/warehouse/query-core/src/main/java/datawave/query/common/grouping/CountAggregator.java b/warehouse/query-core/src/main/java/datawave/query/common/grouping/CountAggregator.java
new file mode 100644
index 0000000000..1144d1f298
--- /dev/null
+++ b/warehouse/query-core/src/main/java/datawave/query/common/grouping/CountAggregator.java
@@ -0,0 +1,103 @@
+package datawave.query.common.grouping;
+
+import java.math.BigDecimal;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.accumulo.core.security.ColumnVisibility;
+import org.apache.commons.lang.builder.ToStringBuilder;
+
+import datawave.query.attributes.Attribute;
+import datawave.query.attributes.TypeAttribute;
+
+/**
+ * Determines the total count of aggregated field values. This supports values of all {@link Attribute} types.
+ */
+public class CountAggregator extends AbstractAggregator<Long> {
+
+ /**
+ * The total number of times the field was seen.
+ */
+ private long count;
+
+ /**
+ * The column visibilities of all attributes aggregated.
+ */
+ private final Set<ColumnVisibility> columnVisibilities;
+
+ public static CountAggregator of(String field, TypeAttribute attribute) {
+ return new CountAggregator(field, attribute.getType().getDelegate().longValue(), attribute.getColumnVisibility());
+ }
+
+ public CountAggregator(String field) {
+ super(field);
+ this.columnVisibilities = new HashSet<>();
+ }
+
+ private CountAggregator(String field, long count, ColumnVisibility visibility) {
+ this(field);
+ this.count = count;
+ if (visibility != null) {
+ columnVisibilities.add(visibility);
+ }
+ }
+
+ /**
+ * Returns {@link AggregateOperation#COUNT}.
+ *
+ * @return {@link AggregateOperation#COUNT}
+ */
+ @Override
+ public AggregateOperation getOperation() {
+ return AggregateOperation.COUNT;
+ }
+
+ @Override
+ public Set<ColumnVisibility> getColumnVisibilities() {
+ return Collections.unmodifiableSet(columnVisibilities);
+ }
+
+ /**
+ * Return the total number of times a field was seen.
+ *
+ * @return the total count
+ */
+ @Override
+ public Long getAggregation() {
+ return count;
+ }
+
+ @Override
+ public boolean hasAggregation() {
+ return count > 0L;
+ }
+
+ /**
+ * Increments the current count by 1.
+ *
+ * @param value
+ * the value to aggregate
+ */
+ @Override
+ public void aggregate(Attribute<?> value) {
+ count++;
+ this.columnVisibilities.add(value.getColumnVisibility());
+ }
+
+ @Override
+ public void merge(Aggregator<?> other) {
+ if (other instanceof CountAggregator) {
+ CountAggregator aggregator = (CountAggregator) other;
+ this.count += aggregator.count;
+ this.columnVisibilities.addAll(aggregator.columnVisibilities);
+ } else {
+ throw new IllegalArgumentException("Cannot merge instance of " + other.getClass().getName());
+ }
+ }
+
+ @Override
+ public String toString() {
+ return new ToStringBuilder(this).append("field", field).append("count", count).append("columnVisibilities", columnVisibilities).toString();
+ }
+}
diff --git a/warehouse/query-core/src/main/java/datawave/query/common/grouping/DocumentGrouper.java b/warehouse/query-core/src/main/java/datawave/query/common/grouping/DocumentGrouper.java
new file mode 100644
index 0000000000..af98c779c7
--- /dev/null
+++ b/warehouse/query-core/src/main/java/datawave/query/common/grouping/DocumentGrouper.java
@@ -0,0 +1,717 @@
+package datawave.query.common.grouping;
+
+import static org.slf4j.LoggerFactory.getLogger;
+
+import java.math.BigDecimal;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.SortedSet;
+import java.util.TreeSet;
+
+import org.apache.accumulo.core.data.Key;
+import org.javatuples.Pair;
+import org.slf4j.Logger;
+
+import com.google.common.collect.ArrayListMultimap;
+import com.google.common.collect.HashMultimap;
+import com.google.common.collect.Multimap;
+import com.google.common.collect.Sets;
+
+import datawave.data.type.Type;
+import datawave.query.attributes.Attribute;
+import datawave.query.attributes.Document;
+import datawave.query.attributes.TypeAttribute;
+
+/**
+ * This class provides the primary functionality needed to group documents and aggregate field values within identified groups (regardless if done server or
+ * client-side).
+ *
+ *
+ * Grouping
+ *
+ * Grouping fields across documents will result in distinct groupings of values for the specified group-by fields, as well as the total number of times
+ * each particular grouping combination was seen. Fields to group by can be specified via any of the following options:
+ *
+ * - The LUCENE function {@code #GROUPBY()}.
+ * - The JEXL function {@code f:groupby()}.
+ * - The query parameter {@code group.fields}.
+ *
+ * Groupings may be of any size that encompass none, some, or all of the target group fields. If a document has no entries for any of the target group fields,
+ * it will be grouped as part of an 'empty' grouping, and all target aggregation entries will be aggregated to the empty grouping. The count for 'empty' groups
+ * will be the same as the number of documents seen without any group-by fields. Values are grouped together based on the format of each document entry's key,
+ * which may have one of the following formats:
+ *
+ * - {@code <FIELD>}
+ * - {@code <FIELD>.<INSTANCE>}
+ * - {@code <FIELD>.<GROUPING_CONTEXT>...<INSTANCE>}
+ *
+ * Values of fields with the same context and instance are considered direct one-to-one grouping matches, and will be placed within the same groupings. Direct
+ * matches cannot be determined for values of fields that do not have a context, and as such they will be combined with each possible grouping, effectively a
+ * cartesian product. Direct matches are prioritized and found first before indirect matches are combined with them.
+ *
+ *
+ * Aggregation
+ *
+ * Once all valid groupings have been identified and counted, aggregation can be performed on the values of any specified fields for each grouping. The
+ * aggregation fields can differ from the group-by fields. The following aggregation operations are supported:
+ *
+ *
+ * SUM: Sum up all the values for specified fields across groupings. This operation is limited to fields with numerical values. Fields may be
+ * specified via:
+ *
+ * - The LUCENE function {@code #SUM()}.
+ * - The JEXL function {@code f:sum()}.
+ * - The query parameter {@code sum.fields}.
+ *
+ * MAX: Find the max values for specified fields across groupings. Fields may be specified via:
+ *
+ * - The LUCENE function {@code #MAX()}.
+ * - The JEXL function {@code f:max()}.
+ * - The query parameter {@code max.fields}.
+ *
+ * MIN: Find the min values for specified fields across groupings. Fields may be specified via:
+ *
+ * - The LUCENE function {@code #MIN()}.
+ * - The JEXL function {@code f:min()}.
+ * - The query parameter {@code min.fields}.
+ *
+ * COUNT: Count the number of times values were seen for specified fields across groupings. Fields may be specified via:
+ *
+ * - The LUCENE function {@code #COUNT()}.
+ * - The JEXL function {@code f:count()}.
+ * - The query parameter {@code count.fields}.
+ *
+ * AVERAGE: Find the average of all values for specified fields across groupings. This operation is limited to fields with numerical values.
+ * Fields may be specified via:
+ *
+ * - The LUCENE function {@code #AVERAGE()}.
+ * - The JEXL function {@code f:average()}.
+ * - The query parameter {@code average.fields}.
+ *
+ */
+public class DocumentGrouper {
+
+ private static final Logger log = getLogger(DocumentGrouper.class);
+
+ public static final String GROUP_COUNT = "COUNT";
+ public static final String FIELD_SUM_SUFFIX = "_SUM";
+ public static final String FIELD_MAX_SUFFIX = "_MAX";
+ public static final String FIELD_MIN_SUFFIX = "_MIN";
+ public static final String FIELD_AVERAGE_NUMERATOR_SUFFIX = "_AVERAGE_NUMERATOR";
+ public static final String FIELD_AVERAGE_DIVISOR_SUFFIX = "_AVERAGE_DIVISOR";
+ public static final String FIELD_AVERAGE_SUFFIX = "_AVERAGE";
+ public static final String FIELD_COUNT_SUFFIX = "_COUNT";
+
+ /**
+ * Groups and aggregates fields from the entries in the given document and merges the new group information into the given {@link Groups} instance.
+ *
+ * @param entry
+ * the document entry
+ * @param groupFields
+ * the fields to group and aggregate
+ * @param groups
+ * the {@link Groups} instance to merge newly found groups into
+ */
+ public static void group(Map.Entry<Key,Document> entry, GroupFields groupFields, Groups groups) {
+ DocumentGrouper documentGrouper = new DocumentGrouper(entry, groupFields, groups);
+ documentGrouper.group();
+ }
+
+ private final Key documentKey;
+ private final Document document;
+ private final Set<String> groupFields;
+ private final Map<String,String> reverseModelMappings;
+ private final FieldAggregator.Factory fieldAggregatorFactory;
+
+ private final Groups groups;
+ private final Groups currentGroups = new Groups();
+ private final FieldIndex groupFieldsIndex = new FieldIndex(false);
+ private final FieldIndex aggregateFieldsIndex = new FieldIndex(true);
+ private final Multimap<Pair<String,String>,Grouping> groupingContextAndInstancesSeenForGroups = HashMultimap.create();
+ private final int maxGroupSize;
+
+ private DocumentGrouper(Map.Entry<Key,Document> documentEntry, GroupFields groupFields, Groups groups) {
+ this.documentKey = documentEntry.getKey();
+ this.document = documentEntry.getValue();
+ this.groupFields = groupFields.getGroupByFields();
+ this.fieldAggregatorFactory = groupFields.getFieldAggregatorFactory();
+ this.reverseModelMappings = groupFields.getReverseModelMap();
+ this.groups = groups;
+ this.maxGroupSize = this.groupFields.size();
+ }
+
+ /**
+ * Identify valid groups in the given document and aggregate relevant events to those groups.
+ */
+ private void group() {
+ log.trace("apply to {} {}", documentKey, document);
+ // If the document contains entries that indicate grouping has already been performed, we are seeing a document that was generated by
+ // GroupingIterator.flatten(). No further grouping can occur. Extract the grouping information from the document and merge them into the current groups.
+ if (isDocumentAlreadyGrouped()) {
+ extractGroupsFromDocument();
+ } else { // Otherwise, the document contains entries that have not yet been grouped and counted.
+ // Index the document entries.
+ indexDocumentEntries();
+ // Group the document entries.
+ groupEntries();
+ // Aggregate fields only if there were aggregation fields specified and if any entries for aggregation were found.
+ if (fieldAggregatorFactory.hasFieldsToAggregate() && !aggregateFieldsIndex.isEmpty()) {
+ aggregateEntries();
+ }
+
+ // Merge the groups and aggregations we found in this particular group-by operation into the groups passed by the user. The separation is required
+ // to ensure that any grouping and aggregation done in this session was applied only to the current document.
+ this.groups.mergeAll(currentGroups);
+ }
+ }
+
+ /**
+ * Return whether the document contains entries representing a flattened set of group counts generated by {@link datawave.query.iterator.GroupingIterator}.
+ *
+ * @return true if the document contains flattened group counts, or false otherwise.
+ */
+ private boolean isDocumentAlreadyGrouped() {
+ return document.getDictionary().keySet().stream().anyMatch(key -> key.startsWith(GROUP_COUNT));
+ }
+
+ /**
+ * Extract grouping information from the current document and add them to the current groups. Each field will be remapped if a reverse-model mapping was
+ * supplied.
+ */
+ @SuppressWarnings("unchecked")
+ private void extractGroupsFromDocument() {
+ // Parse a field from each entry and store them in instanceToFields. The id indicates which grouping, count, and aggregated values go together.
+ Multimap<String,Field> idToFields = HashMultimap.create();
+ for (Map.Entry<String,Attribute<? extends Comparable<?>>> entry : document.entrySet()) {
+ Field field = parseField(entry);
+ idToFields.put(field.getInstance(), field);
+ }
+ // For each distinct grouping, parse and write the grouping information to the current groups.
+ for (String instance : idToFields.keySet()) {
+ // The distinct grouping.
+ Grouping grouping = new Grouping();
+ // The aggregated values.
+ FieldAggregator fieldAggregator = new FieldAggregator();
+ // The total times the grouping was seen.
+ int count = 0;
+ for (Field field : idToFields.get(instance)) {
+ // We found the group count.
+ if (field.getBase().equals(GROUP_COUNT)) {
+ TypeAttribute<? extends Number> attribute = (TypeAttribute<? extends Number>) field.getAttribute();
+ count = attribute.getType().getDelegate().intValue();
+ // We found the sum of an aggregated field.
+ } else if (field.getBase().endsWith(FIELD_SUM_SUFFIX)) {
+ TypeAttribute<? extends Number> attribute = (TypeAttribute<? extends Number>) field.getAttribute();
+ String fieldName = removeSuffix(field.getBase(), FIELD_SUM_SUFFIX);
+ fieldAggregator.mergeAggregator(SumAggregator.of(fieldName, attribute));
+ // We found the numerator of the average of an aggregated field.
+ } else if (field.getBase().endsWith(FIELD_AVERAGE_NUMERATOR_SUFFIX)) {
+ String unmappedFieldName = removeSuffix(field.getBase(), FIELD_AVERAGE_NUMERATOR_SUFFIX);
+ String fieldName = removeSuffix(field.getBase(), FIELD_AVERAGE_NUMERATOR_SUFFIX);
+ // It's possible that the divisor will be stored under a previously unmapped field name. For example, the field ETA from
+ // ETA_AVERAGE_NUMERATOR.1 could be mapped to AG here. Use the original field name (e.g. ETA) to ensure we find the
+ // corresponding divisor (e.g. ETA_AVERAGE_DIVISOR.1) for the numerator.
+ String divisorField = unmappedFieldName + FIELD_AVERAGE_DIVISOR_SUFFIX + "." + field.getInstance();
+ TypeAttribute<? extends Number> divisorAttribute = (TypeAttribute<? extends Number>) document.get(divisorField);
+ TypeAttribute<? extends Number> numeratorAttribute = (TypeAttribute<? extends Number>) field.getAttribute();
+ fieldAggregator.mergeAggregator(AverageAggregator.of(fieldName, numeratorAttribute, divisorAttribute));
+ // We found the count of an aggregated field.
+ } else if (field.getBase().endsWith(FIELD_COUNT_SUFFIX)) {
+ TypeAttribute<? extends Number> attribute = (TypeAttribute<? extends Number>) field.getAttribute();
+ String fieldName = removeSuffix(field.getBase(), FIELD_COUNT_SUFFIX);
+ fieldAggregator.mergeAggregator(CountAggregator.of(fieldName, attribute));
+ // We found the min of an aggregated field.
+ } else if (field.getBase().endsWith(FIELD_MIN_SUFFIX)) {
+ String fieldName = removeSuffix(field.getBase(), FIELD_MIN_SUFFIX);
+ fieldAggregator.mergeAggregator(MinAggregator.of(fieldName, field.getAttribute()));
+ // We found the max of an aggregated field.
+ } else if (field.getBase().endsWith(FIELD_MAX_SUFFIX)) {
+ String fieldName = removeSuffix(field.getBase(), FIELD_MAX_SUFFIX);
+ fieldAggregator.mergeAggregator(MaxAggregator.of(fieldName, field.getAttribute()));
+ // We found a field that is part of the grouping.
+ } else if (!field.getBase().endsWith(FIELD_AVERAGE_DIVISOR_SUFFIX)) {
+ Attribute<?> attribute = field.getAttribute();
+ GroupingAttribute<?> newAttribute = new GroupingAttribute<>((Type<?>) attribute.getData(), new Key(field.getBase()), true);
+ newAttribute.setColumnVisibility(attribute.getColumnVisibility());
+ grouping.add(newAttribute);
+ }
+ }
+ // Create a new group and merge it into the existing groups.
+ Group group = new Group(grouping, count);
+ group.setFieldAggregator(fieldAggregator);
+ group.addDocumentVisibility(document.getColumnVisibility());
+ groups.mergeOrPutGroup(group);
+ }
+ }
+
+ /**
+ * Return a substring of the given str without the given suffix.
+ *
+ * @param str
+ * the string
+ * @param suffix
+ * the suffix
+ * @return the string without the suffix
+ */
+ private String removeSuffix(String str, String suffix) {
+ int suffixLength = suffix.length();
+ return str.substring(0, str.length() - suffixLength);
+ }
+
+ /**
+ * Identify which events in the document are targets for grouping and/or aggregation, and index them.
+ */
+ private void indexDocumentEntries() {
+ for (Map.Entry<String,Attribute<? extends Comparable<?>>> entry : document.entrySet()) {
+ Field field = parseField(entry);
+ // The current field is a target for grouping.
+ if (groupFields.contains(field.getBase())) {
+ groupFieldsIndex.index(field);
+ }
+ // The current field is a target for aggregation.
+ if (fieldAggregatorFactory.isFieldToAggregate(field.getBase())) {
+ aggregateFieldsIndex.index(field);
+ }
+ }
+ }
+
+ /**
+ * Identify valid groupings consisting of target group pairs and create/update their corresponding {@link Group} in {@link #currentGroups}.
+ */
+ private void groupEntries() {
+ // If we found any entries for target group fields, identify all valid groupings.
+ if (groupEntriesFound()) {
+ // The grouping combinations that we find. Each combination may only have one Field from a particular target group field, e.g. if doing
+ // #GROUP_BY(AGE,GENDER), a combination set will have at most one AGE field and one GENDER field.
+ List<Set<Field>> groupings = new ArrayList<>();
+
+ // If we only have one target grouping field, we do not need to find any group combinations. All events for the given target group field should be
+ // tracked as individual groupings.
+ if (maxGroupSize == 1) {
+ groupFieldsIndex.fields.values().stream().map(Collections::singleton).forEach(groupings::add);
+ } else {
+ // If we have any group field events with grouping contexts and instances, e.g. GENDER.FOO.1, it's possible that we will find direct matches to
+ // other group field events with the same grouping context and instance (a direct match). These should be found first for efficiency purposes.
+ if (groupFieldsIndex.hasFieldsWithPossibleDirectMatch()) {
+ groupings = getGroupingsWithDirectMatches();
+ }
+ // If we have any group field events that do not have a grouping context and instance, e.g. GENDER.1 or GENDER, then each one of those events
+ // should be combined with each existing group combination, effectively creating cartesian products.
+ if (groupFieldsIndex.hasFieldsWithoutDirectMatch()) {
+ groupings = getGroupingsWithoutDirectMatches(groupings);
+ }
+ }
+
+ // Track each identified grouping.
+ groupings.forEach(this::trackGroup);
+ } else {
+ // If no entries were found for any of the target group fields, create a single 'empty' group that will represent this document in the final
+ // grouping results.
+ trackGroup(Grouping.emptyGrouping());
+ }
+ }
+
+ /**
+ * Identify grouping combinations that are direct matches to each other based on the grouping context and instance of the field events. If we do not find
+ * any direct match at all for a specified target group field, then all events for the group field will be combined.
+ *
+ * @return the direct match combinations
+ */
+ private List<Set<Field>> getGroupingsWithDirectMatches() {
+ List<Set<Field>> groupings = new ArrayList<>();
+ Set<String> fieldsWithGroupingContextAndInstance = groupFieldsIndex.getFieldsWithPossibleDirectMatch();
+ // If we only saw one field with a grouping context and instance, return a list of singletons with each field event. We cannot create any combinations
+ // at this time.
+ if (fieldsWithGroupingContextAndInstance.size() == 1) {
+ Collection<Field> fields = groupFieldsIndex.getFields(fieldsWithGroupingContextAndInstance.iterator().next());
+ fields.stream().map(Collections::singleton).forEach(groupings::add);
+ } else {
+ // If we have more than one target field with a grouping context and instance, determine the correct groupings based off matching the grouping
+ // context and instance where possible with direct 1-to-1 matches, i.e. AGE.FOO.1 is a direct match to GENDER.FOO.1.
+ Multimap<Pair<String,String>,Field> groupingContextAndInstanceToField = HashMultimap.create();
+ for (String fieldName : fieldsWithGroupingContextAndInstance) {
+ Collection<Field> fields = groupFieldsIndex.getFields(fieldName);
+ for (Field field : fields) {
+ groupingContextAndInstanceToField.put(Pair.with(field.getGroupingContext(), field.getInstance()), field);
+ }
+ }
+
+ // Sort the entries by the number of direct matches seen for each grouping context-instance pair.
+ SortedSet<Map.Entry<Pair<String,String>,Collection<Field>>> directMatchesSortedByPrevalence = new TreeSet<>(
+ Comparator.comparingInt((Map.Entry<Pair<String,String>,Collection<Field>> left) -> left.getValue().size()).reversed()
+ .thenComparing(Map.Entry::getKey));
+ directMatchesSortedByPrevalence.addAll(groupingContextAndInstanceToField.asMap().entrySet());
+
+ // Map of group target field names to the grouping combinations found for them.
+ Multimap<SortedSet<String>,Set<Field>> fieldsToGroupings = ArrayListMultimap.create();
+ // Tracks the largest size seen for any combination of direct matches for target group fields.
+ Map<String,Integer> fieldToLargestGroupingSize = new HashMap<>();
+
+ for (Map.Entry<Pair<String,String>,Collection<Field>> entry : directMatchesSortedByPrevalence) {
+ Collection<Field> fields = entry.getValue();
+ SortedSet<String> groupingFields = new TreeSet<>();
+ boolean keep = false;
+ for (Field field : fields) {
+ groupingFields.add(field.getBase());
+ // If we have seen this field before associated with another grouping context and instance, only keep this grouping if it is the same size
+ // as the largest grouping we've seen for the field.
+ if (fieldToLargestGroupingSize.containsKey(field.getBase())) {
+ if (fields.size() == fieldToLargestGroupingSize.get(field.getBase())) {
+ keep = true;
+ }
+ } else {
+ // If this is the first time we are seeing this field, then we have found the largest batch size for the grouping that this field is in.
+ // Automatically keep this grouping.
+ fieldToLargestGroupingSize.put(field.getBase(), fields.size());
+ keep = true;
+ }
+ }
+ if (keep) {
+ fieldsToGroupings.put(groupingFields, Sets.newHashSet(fields));
+ }
+ }
+
+ // Now that we've found the largest direct match combinations for each target group field, we need to effectively create cartesian products between
+ // each combination. For instance, given the following grouping combinations resulting from #GROUP_BY(AGE,GENDER,RECORD_ID,RECORD_TEXT,BUILDING):
+ //
+ // {AGE,GENDER} => [{"20", "MALE"},{"10", "FEMALE"}]
+ // {RECORD_ID,RECORD_TEXT} => [{"123", "Summary"}]
+ // {BUILDING} => [{West},{East}]
+ //
+ // We want to generate the following combinations:
+ // {"20","MALE","123","Summary","West"}
+ // {"20","MALE","123","Summary","East"}
+ // {"10","FEMALE","123","Summary","West"}
+ // {"10","FEMALE","123","Summary","East"}
+ for (SortedSet<String> fields : fieldsToGroupings.keySet()) {
+ Collection<Set<Field>> currentGroupings = fieldsToGroupings.get(fields);
+ if (groupings.isEmpty()) {
+ groupings.addAll(currentGroupings);
+ } else {
+ List<Set<Field>> newGroupings = new ArrayList<>();
+ for (Set<Field> oldGrouping : groupings) {
+ for (Set<Field> currentGrouping : currentGroupings) {
+ Set<Field> newGrouping = new HashSet<>(oldGrouping);
+ newGrouping.addAll(currentGrouping);
+ newGroupings.add(newGrouping);
+ }
+ }
+ groupings = newGroupings;
+ }
+ }
+ }
+ return groupings;
+ }
+
+ /**
+ * Combine each field event for target group fields that do not have both a grouping context and instance to any previously found grouping combinations.
+ *
+ * @param prevGroupings
+ * the combinations that have been found thus far
+ * @return the updated grouping combinations
+ */
+ private List<Set<Field>> getGroupingsWithoutDirectMatches(List<Set<Field>> prevGroupings) {
+ List<Set<Field>> groupings = new ArrayList<>(prevGroupings);
+ for (String fieldName : groupFieldsIndex.getFieldsWithoutDirectMatch()) {
+ Collection<Field> fields = groupFieldsIndex.getFields(fieldName);
+ // If there are no previous grouping combinations, add each field event as a singular combination.
+ if (groupings.isEmpty()) {
+ for (Field field : fields) {
+ groupings.add(Sets.newHashSet(field));
+ }
+ } else {
+ // Effectively create cartesian products of each previously seen grouping combination and each field event for the current target event field.
+ // For instance, if we have the previous combination [{"20","MALE"},{"10","FEMALE"}] and the field events {"A","B","C"}, we want to generate
+ // the following combinations:
+ //
+ // {"20","MALE", "A"}
+ // {"20","MALE", "B"}
+ // {"20","MALE", "C"}
+ // {"10","FEMALE", "A"}
+ // {"10","FEMALE", "B"}
+ // {"10","FEMALE", "C"}
+ List<Set<Field>> newGroupings = new ArrayList<>();
+ for (Set<Field> oldGrouping : groupings) {
+ for (Field field : fields) {
+ Set<Field> newGrouping = new HashSet<>(oldGrouping);
+ newGrouping.add(field);
+ newGroupings.add(newGrouping);
+ }
+ }
+ groupings = newGroupings;
+ }
+ }
+ return groupings;
+ }
+
+ /**
+ * Track the groups identified by the given field event combinations.
+ *
+ * @param groupedFields
+ * the group combination
+ */
+ private void trackGroup(Collection<Field> groupedFields) {
+ // The grouping context-instance pairs seen for all grouping keys generated in this method.
+ Set<Pair<String,String>> groupingContextAndInstances = new HashSet<>();
+ // The set of 'keys' that are used to identify individual distinct groupings.
+ List<Grouping> groupings = new ArrayList<>();
+ // It is possible for a field event in a grouping combination to have a multi-value attribute. If this occurs, we must once again create cartesian
+ // products between all the values of the attribute of each field.
+ for (Field field : groupedFields) {
+ // Track the grouping context-instance pair. This is required for us to be able to find direct matches later when aggregating.
+ if (field.hasGroupingContext() && field.hasInstance()) {
+ groupingContextAndInstances.add(Pair.with(field.getGroupingContext(), field.getInstance()));
+ }
+ // If we have no grouping keys yet, create keys consisting of each value of the current field.
+ if (groupings.isEmpty()) {
+ for (Attribute<?> attribute : field.getAttributes()) {
+ GroupingAttribute<?> copy = createCopyWithKey(attribute, field.getBase());
+ groupings.add(new Grouping(copy));
+ }
+ } else {
+ // Otherwise, create the cartesian product between the current field's value and each existing key.
+ List<Grouping> newGroupings = new ArrayList<>();
+ for (Attribute<?> attribute : field.getAttributes()) {
+ GroupingAttribute<?> copy = createCopyWithKey(attribute, field.getBase());
+ for (Grouping grouping : groupings) {
+ Grouping groupingCopy = new Grouping(grouping);
+ groupingCopy.add(copy);
+ newGroupings.add(groupingCopy);
+ }
+ }
+ groupings = newGroupings;
+ }
+ }
+
+ // Track which grouping context-instance pairs we have seen for each grouping key.
+ for (Pair<String,String> groupingContextAndInstance : groupingContextAndInstances) {
+ this.groupingContextAndInstancesSeenForGroups.putAll(groupingContextAndInstance, groupings);
+ }
+
+ // Now we can create/update groups in currentGroups for each grouping key.
+ groupings.forEach(this::trackGroup);
+ }
+
+ /**
+ * Create/update the group for the given grouping.
+ *
+ * @param grouping
+ * the grouping to track
+ */
+ private void trackGroup(Grouping grouping) {
+ // Get the group.
+ Group group = currentGroups.getGroup(grouping);
+ // Create a group for the grouping if one does not already exist.
+ if (group == null) {
+ group = new Group(grouping);
+ group.setFieldAggregator(fieldAggregatorFactory.newInstance());
+ currentGroups.putGroup(group);
+ }
+ // Add the visibilities of each attribute in the grouping for combination later, and increment the count for how many times this distinct
+ // grouping was seen.
+ group.addAttributeVisibilities(grouping);
+ group.incrementCount();
+ group.addDocumentVisibility(document.getColumnVisibility());
+ }
+
+ private GroupingAttribute<?> createCopyWithKey(Attribute<?> attribute, String key) {
+ Type<?> type = ((TypeAttribute<?>) attribute).getType();
+ GroupingAttribute<?> newAttribute = new GroupingAttribute<>(type, new Key(key), true);
+ newAttribute.setColumnVisibility(attribute.getColumnVisibility());
+ return newAttribute;
+ }
+
+ /**
+ * Aggregate all qualifying events that are from target aggregation fields.
+ */
+ private void aggregateEntries() {
+ // Groupings were found in the document. Aggregate entries according to their association based on each entry's grouping context and instance.
+ if (groupEntriesFound()) {
+ // If we have any target events for aggregation that have a grouping context and instance, e.g. AGE.FOO.1, attempt to find groups that have matching
+ // grouping context and instance pairs, and aggregate the events into those groups only. If we do not find any direct match at all for a specified
+ // aggregation field, then all events for the aggregation field will be aggregated into each group.
+ if (aggregateFieldsIndex.hasFieldsWithPossibleDirectMatch()) {
+ // Attempt to find a direct match for the current aggregation target field.
+ for (String fieldName : aggregateFieldsIndex.fieldToFieldsByGroupingContextAndInstance.keySet()) {
+ Multimap<Pair<String,String>,Field> groupingContextAndInstanceToFields = aggregateFieldsIndex.fieldToFieldsByGroupingContextAndInstance
+ .get(fieldName);
+ Set<Pair<String,String>> aggregatePairs = groupingContextAndInstanceToFields.keySet();
+ Set<Pair<String,String>> groupPairs = this.groupingContextAndInstancesSeenForGroups.keySet();
+ // A group and an aggregation event are considered to be a direct match if and only if the group contains any event that has the same
+ // grouping context and instance as the aggregation event.
+ Set<Pair<String,String>> directMatches = Sets.intersection(aggregatePairs, groupPairs);
+ // If we have any direct matches, then only aggregate the direct matches into the groups where we saw a direct match.
+ if (!directMatches.isEmpty()) {
+ for (Pair<String,String> directMatch : directMatches) {
+ for (Grouping grouping : this.groupingContextAndInstancesSeenForGroups.get(directMatch)) {
+ Group group = currentGroups.getGroup(grouping);
+ Collection<Field> fields = groupingContextAndInstanceToFields.get(directMatch);
+ group.aggregateAll(fields);
+ }
+ }
+ } else {
+ // Otherwise, aggregate all events for this field into all groups.
+ Collection<Field> fields = aggregateFieldsIndex.getFields(fieldName);
+ currentGroups.aggregateToAllGroups(fields);
+ }
+ }
+ }
+ // If there are any target aggregation events that do not have a grouping context, e.g. AGE or AGE.1, then all target aggregation events should be
+ // aggregated into all groups.
+ if (aggregateFieldsIndex.hasFieldsWithoutDirectMatch()) {
+ for (String fieldName : aggregateFieldsIndex.fieldsWithoutDirectMatch) {
+ Collection<Field> fields = aggregateFieldsIndex.getFields(fieldName);
+ currentGroups.aggregateToAllGroups(fields);
+ }
+ }
+ } else {
+ // No groupings were found in the document. In this case, we will consider this document to contain a placeholder 'empty' grouping, and aggregate
+ // all aggregation entries to the empty grouping.
+ Group group = currentGroups.getGroup(Grouping.emptyGrouping());
+ // Aggregate all aggregate entries to the grouping.
+ Multimap<String,Field> fields = aggregateFieldsIndex.fields;
+ for (String field : fields.keySet()) {
+ group.aggregateAll(field, fields.get(field));
+ }
+ }
+ }
+
+ private boolean groupEntriesFound() {
+ return !groupFieldsIndex.isEmpty();
+ }
+
+ /**
+ * Parses the relevant information from the given entry and returns a {@link Field} that contains the field name, group, instance, and the value. It is
+ * assumed that the entry's key will have the format {@code <FIELD>}, {@code <FIELD>.<INSTANCE>}, or {@code <FIELD>.<GROUPING_CONTEXT>...<INSTANCE>}.
+ *
+ * @param entry
+ * the document entry
+ * @return the field entry.
+ */
+ private Field parseField(Map.Entry<String,Attribute<? extends Comparable<?>>> entry) {
+ String field = entry.getKey();
+ String name = field;
+ String groupingContext = null;
+ String instance = null;
+
+ int firstPeriod = field.indexOf('.');
+ // If the field name contains at least one period, the field's format is either <FIELD>.<INSTANCE> or <FIELD>.<GROUPING_CONTEXT>...<INSTANCE>.
+ if (firstPeriod != -1) {
+ // The field name is everything before the first period.
+ name = field.substring(0, firstPeriod);
+
+ int secondPeriod = field.indexOf(".", firstPeriod + 1);
+ // If a second period is present, we know that the field's format is <FIELD>.<GROUPING_CONTEXT>...<INSTANCE>.
+ if (secondPeriod != -1) {
+ // Parse the group from the substring directly following the name.
+ groupingContext = field.substring(firstPeriod + 1, secondPeriod);
+ // Parse the instance from the substring after the last period.
+ instance = field.substring(field.lastIndexOf(".") + 1);
+ } else {
+ // If there is no second period present, the field's format is <FIELD>.<INSTANCE>.
+ instance = field.substring(firstPeriod + 1);
+ }
+ }
+
+ // Map the field name to the root model name. This ensures that even if we're grouping fields that can be seen with different model names, e.g. AG, ETA,
+ // and AGE, that the same root name will be used across the board to ensure that they're treated as from the same target group/aggregation field.
+ name = getMappedFieldName(name);
+
+ return new Field(name, groupingContext, instance, entry.getValue());
+ }
+
+ /**
+ * Get the corresponding model mapping for the field. If model mappings have not been provided, the original field will be returned.
+ *
+ * @param field
+ * the field to map
+ * @return the mapped field
+ */
+ private String getMappedFieldName(String field) {
+ return reverseModelMappings.getOrDefault(field, field);
+ }
+
+ /**
+ * This class maintains useful indexes that will be used for determining direct and non-direct matches when grouping and aggregating.
+ */
+ private static class FieldIndex {
+
+ // Map of field names to their entries.
+ private final Multimap<String,Field> fields = ArrayListMultimap.create();
+ // The set of fields with possible direct matches.
+ private final Set<String> fieldsWithPossibleDirectMatch = new HashSet<>();
+ // The set of fields with no direct matches.
+ private final Set<String> fieldsWithoutDirectMatch = new HashSet<>();
+ // Map of field names to Multimaps of grouping contexts to entries.
+ private final Map<String,Multimap<Pair<String,String>,Field>> fieldToFieldsByGroupingContextAndInstance = new HashMap<>();
+ // Whether to accept entries that have null attributes for indexing.
+ private final boolean allowNullAttributes;
+
+ private FieldIndex(boolean allowNullAttributes) {
+ this.allowNullAttributes = allowNullAttributes;
+ }
+
+ /**
+ * Index the given {@link Field}. If {@link #allowNullAttributes} is set to false and the given field has a null attribute, it will not be indexed.
+ *
+ * @param field
+ * the field to index
+ */
+ public void index(Field field) {
+ // Check if we can index this field.
+ if (field.getAttribute() != null || allowNullAttributes) {
+ fields.put(field.getBase(), field);
+ // If the field has a grouping context and instance, it's possible that it may have a direct match. Index the field and its grouping
+ // context-instance pair.
+ if (field.hasGroupingContext() && field.hasInstance()) {
+ fieldsWithPossibleDirectMatch.add(field.getBase());
+ Multimap<Pair<String,String>,Field> groupingContextAndInstanceToField = fieldToFieldsByGroupingContextAndInstance.get(field.getBase());
+ if (groupingContextAndInstanceToField == null) {
+ groupingContextAndInstanceToField = HashMultimap.create();
+ fieldToFieldsByGroupingContextAndInstance.put(field.getBase(), groupingContextAndInstanceToField);
+ }
+ groupingContextAndInstanceToField.put(Pair.with(field.getGroupingContext(), field.getInstance()), field);
+ } else {
+ // Otherwise, the field will have no direct matches.
+ fieldsWithoutDirectMatch.add(field.getBase());
+ }
+ }
+ }
+
+ public Multimap<String,Field> getFields() {
+ return fields;
+ }
+
+ public Collection<Field> getFields(String field) {
+ return fields.get(field);
+ }
+
+ public Set<String> getFieldsWithPossibleDirectMatch() {
+ return fieldsWithPossibleDirectMatch;
+ }
+
+ public boolean hasFieldsWithPossibleDirectMatch() {
+ return !fieldsWithPossibleDirectMatch.isEmpty();
+ }
+
+ public boolean hasFieldsWithoutDirectMatch() {
+ return !fieldsWithoutDirectMatch.isEmpty();
+ }
+
+ public Set<String> getFieldsWithoutDirectMatch() {
+ return fieldsWithoutDirectMatch;
+ }
+
+ public boolean isEmpty() {
+ return fields.isEmpty();
+ }
+ }
+}
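For orientation, a rough sketch of how the static entry point above might be driven by a caller (assumed code, not part of the patch; the documents iterable is a hypothetical stand-in, and the parameter string follows the GroupFields format introduced later in this patch):

    // Hypothetical driver: fold a stream of documents into a shared Groups instance.
    GroupFields groupFields = GroupFields.from("GROUP(AGE,GENDER)|SUM(BALANCE)|COUNT(AGE)");
    Groups groups = new Groups();
    for (Map.Entry<Key,Document> entry : documents) {   // 'documents' is assumed to exist
        DocumentGrouper.group(entry, groupFields, groups);
    }
    // 'groups' now holds one Group per distinct (AGE, GENDER) pairing, each with its count
    // and whatever SUM/COUNT aggregations matched that grouping.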
diff --git a/warehouse/query-core/src/main/java/datawave/query/common/grouping/Field.java b/warehouse/query-core/src/main/java/datawave/query/common/grouping/Field.java
new file mode 100644
index 0000000000..7af6af2c16
--- /dev/null
+++ b/warehouse/query-core/src/main/java/datawave/query/common/grouping/Field.java
@@ -0,0 +1,124 @@
+package datawave.query.common.grouping;
+
+import java.util.Collections;
+import java.util.Objects;
+import java.util.Set;
+
+import org.apache.commons.lang.builder.ToStringBuilder;
+
+import datawave.query.attributes.Attribute;
+import datawave.query.attributes.Attributes;
+
+/**
+ * Represents an entry from a document with a field name broken down into its name, group, and instance, and the entry's attribute.
+ */
+class Field {
+
+ private final String base;
+ private final String groupingContext;
+ private final String instance;
+ private final Attribute<? extends Comparable<?>> attribute;
+ private final Set<Attribute<? extends Comparable<?>>> attributes;
+
+ public Field(String base, String groupingContext, String instance, Attribute<? extends Comparable<?>> attribute) {
+ this.base = base;
+ this.groupingContext = groupingContext;
+ this.instance = instance;
+ this.attribute = attribute;
+
+ if (attribute instanceof Attributes) {
+ this.attributes = ((Attributes) attribute).getAttributes();
+ } else {
+ this.attributes = Collections.singleton(attribute);
+ }
+ }
+
+ /**
+ * Return the field base.
+ *
+ * @return the field base
+ */
+ public String getBase() {
+ return base;
+ }
+
+ /**
+ * Return whether this field has a grouping context as part of its name.
+ *
+ * @return true if this field has a group, or false otherwise
+ */
+ public boolean hasGroupingContext() {
+ return groupingContext != null;
+ }
+
+ /**
+ * Return the field's group, or null if the field does not have a group.
+ *
+ * @return the group
+ */
+ public String getGroupingContext() {
+ return groupingContext;
+ }
+
+ /**
+ * Return the field's instance, or null if the field does not have an instance.
+ *
+ * @return the instance
+ */
+ public String getInstance() {
+ return instance;
+ }
+
+ /**
+ * Return whether this field has an instance as part of its name.
+ *
+ * @return true if this field has an instance, or false otherwise
+ */
+ public boolean hasInstance() {
+ return instance != null;
+ }
+
+ /**
+ * Return this field's attribute.
+ *
+ * @return the attribute
+ */
+ public Attribute<? extends Comparable<?>> getAttribute() {
+ return attribute;
+ }
+
+ /**
+ * A convenience method for retrieving all attributes for this {@link Field}, particularly useful when dealing with a {@link Field} that was created with a
+ * multi-value attribute. If the originating attribute was not multi-value, then the set will consist only of the same attribute returned by
+ * {@link #getAttribute()}.
+ *
+ * @return all attributes, or same attribute as returned by {@link #getAttribute()} if the originating attribute was not multi-value
+ */
+ public Set<Attribute<? extends Comparable<?>>> getAttributes() {
+ return attributes;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (o == null || getClass() != o.getClass()) {
+ return false;
+ }
+ Field field = (Field) o;
+ return Objects.equals(base, field.base) && Objects.equals(groupingContext, field.groupingContext) && Objects.equals(instance, field.instance)
+ && Objects.equals(attributes, field.attributes);
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hash(base, groupingContext, instance, attributes);
+ }
+
+ @Override
+ public String toString() {
+ return new ToStringBuilder(this).append("base", base).append("groupingContext", groupingContext).append("instance", instance)
+ .append("attributes", attributes).toString();
+ }
+}
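To make the base/grouping-context/instance split concrete, a small sketch of how the three supported key formats map onto this class (illustrative values from code in the same package; the attribute argument is a hypothetical stand-in for whatever the document supplies):

    // "AGE"        -> base "AGE", no grouping context, no instance
    Field plain = new Field("AGE", null, null, attribute);
    // "AGE.1"      -> base "AGE", no grouping context, instance "1"
    Field instanced = new Field("AGE", null, "1", attribute);
    // "AGE.FOO.1"  -> base "AGE", grouping context "FOO", instance "1"
    Field grouped = new Field("AGE", "FOO", "1", attribute);

    assert grouped.hasGroupingContext() && grouped.hasInstance();
    assert !plain.hasGroupingContext() && !plain.hasInstance();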
diff --git a/warehouse/query-core/src/main/java/datawave/query/common/grouping/FieldAggregator.java b/warehouse/query-core/src/main/java/datawave/query/common/grouping/FieldAggregator.java
new file mode 100644
index 0000000000..304dda525b
--- /dev/null
+++ b/warehouse/query-core/src/main/java/datawave/query/common/grouping/FieldAggregator.java
@@ -0,0 +1,404 @@
+package datawave.query.common.grouping;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Set;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+
+import org.apache.commons.lang.builder.ToStringBuilder;
+
+import datawave.query.attributes.Attribute;
+
+/**
+ * This class provides functionality to aggregate values for specified target fields using specified aggregation operations.
+ */
+public class FieldAggregator {
+
+ private final Map<String,Map<AggregateOperation,Aggregator<?>>> aggregatorMap;
+
+ public FieldAggregator() {
+ aggregatorMap = new HashMap<>();
+ }
+
+ public FieldAggregator(Set<String> sumFields, Set<String> maxFields, Set<String> minFields, Set<String> countFields, Set<String> averageFields) {
+ this();
+ populateAggregators(sumFields, SumAggregator::new);
+ populateAggregators(maxFields, MaxAggregator::new);
+ populateAggregators(minFields, MinAggregator::new);
+ populateAggregators(countFields, CountAggregator::new);
+ populateAggregators(averageFields, AverageAggregator::new);
+ }
+
+ /**
+ * Add an aggregator supplied by the given constructor for each of the given fields to the aggregator map.
+ *
+ * @param fields
+ * the fields
+ * @param constructor
+ * the aggregator constructor
+ */
+ private void populateAggregators(Set<String> fields, Function<String,Aggregator<?>> constructor) {
+ if (fields != null) {
+ for (String field : fields) {
+ Aggregator<?> aggregator = constructor.apply(field);
+ Map<AggregateOperation,Aggregator<?>> map = aggregatorMap.get(field);
+ if (map == null) {
+ map = new HashMap<>();
+ this.aggregatorMap.put(field, map);
+ }
+ map.put(aggregator.getOperation(), aggregator);
+ }
+ }
+ }
+
+ /**
+ * Aggregate the given field to all relevant aggregators.
+ *
+ * @param field
+ * the field to aggregate
+ */
+ public void aggregate(Field field) {
+ if (aggregatorMap.containsKey(field.getBase())) {
+ Collection<Aggregator<?>> aggregators = this.aggregatorMap.get(field.getBase()).values();
+ for (Attribute<?> attribute : field.getAttributes()) {
+ aggregators.forEach(aggregator -> aggregator.aggregate(attribute));
+ }
+ }
+ }
+
+ /**
+ * Aggregate each of the given fields to all relevant aggregators.
+ *
+ * @param fields
+ * the fields to aggregate
+ */
+ public void aggregateAll(Collection fields) {
+ fields.forEach(this::aggregate);
+ }
+
+ /**
+ * Aggregate each of the given fields to all relevant aggregators for the given field. This is more efficient than {@link #aggregateAll(Collection)} when
+ * you have a collection of fields for the same base field.
+ *
+ * @param field
+ * the base field name
+ * @param fields
+ * the fields to aggregate
+ */
+ public void aggregateAll(String field, Collection fields) {
+ if (aggregatorMap.containsKey(field)) {
+ List<Attribute<?>> attributes = fields.stream().map(Field::getAttribute).collect(Collectors.toList());
+ Collection<Aggregator<?>> aggregators = this.aggregatorMap.get(field).values();
+ for (Aggregator<?> aggregator : aggregators) {
+ aggregator.aggregateAll(attributes);
+ }
+ }
+ }
+
+ /**
+ * Return the map of fields to their aggregators.
+ *
+ * @return the aggregator map.
+ */
+ public Map<String,Map<AggregateOperation,Aggregator<?>>> getAggregatorMap() {
+ return aggregatorMap;
+ }
+
+ public Aggregator<?> getAggregator(String field, AggregateOperation operation) {
+ Map<AggregateOperation,Aggregator<?>> map = aggregatorMap.get(field);
+ if (map != null) {
+ return map.get(operation);
+ }
+ return null;
+ }
+
+ /**
+ * Return the set of all fields being aggregated.
+ *
+ * @return the fields
+ */
+ public Collection<String> getFieldsToAggregate() {
+ return aggregatorMap.keySet();
+ }
+
+ /**
+ * Merge the given aggregator into this {@link FieldAggregator}.
+ *
+ * @param aggregator
+ * the aggregator to merge.
+ */
+ public void mergeAggregator(Aggregator<?> aggregator) {
+ if (aggregator.hasAggregation()) {
+ Map<AggregateOperation,Aggregator<?>> map = aggregatorMap.computeIfAbsent(aggregator.getField(), k -> new HashMap<>());
+ if (map.containsKey(aggregator.getOperation())) {
+ Aggregator<?> currentAggregator = map.get(aggregator.getOperation());
+ if (currentAggregator.hasAggregation()) {
+ currentAggregator.merge(aggregator);
+ } else {
+ map.put(aggregator.getOperation(), aggregator);
+ }
+ } else {
+ map.put(aggregator.getOperation(), aggregator);
+ }
+ }
+
+ }
+
+ /**
+ * Merge the given {@link FieldAggregator} into this one.
+ *
+ * @param other
+ * the field aggregator to merge in
+ */
+ public void merge(FieldAggregator other) {
+ for (String field : other.aggregatorMap.keySet()) {
+ // If we already have aggregators for this field, merge the aggregators for the current field from the other aggregated fields into this one.
+ if (this.aggregatorMap.containsKey(field)) {
+ Map<AggregateOperation,Aggregator<?>> thisMap = this.aggregatorMap.get(field);
+ Map<AggregateOperation,Aggregator<?>> otherMap = other.aggregatorMap.get(field);
+ for (AggregateOperation operation : otherMap.keySet()) {
+ if (thisMap.containsKey(operation)) {
+ Aggregator<?> currentAggregator = thisMap.get(operation);
+ Aggregator<?> otherAggregator = otherMap.get(operation);
+ if (currentAggregator.hasAggregation() && otherAggregator.hasAggregation()) {
+ currentAggregator.merge(otherAggregator);
+ } else if (otherAggregator.hasAggregation()) {
+ thisMap.put(operation, otherAggregator);
+ }
+ } else {
+ thisMap.put(operation, otherMap.get(operation));
+ }
+ }
+ } else {
+ // If no aggregators exist in this field aggregator for the current field, add all of the other's aggregators for it.
+ this.aggregatorMap.put(field, new HashMap<>(other.aggregatorMap.get(field)));
+ }
+ }
+ }
+
+ @Override
+ public String toString() {
+ return aggregatorMap.toString();
+ }
+
+ /**
+ * A factory that will generate new {@link FieldAggregator} with the designated sum, max, min, count, and average aggregation field targets.
+ */
+ public static class Factory {
+
+ private final Set<String> sumFields;
+ private final Set<String> maxFields;
+ private final Set<String> minFields;
+ private final Set<String> countFields;
+ private final Set<String> averageFields;
+ private final Set<String> allFields;
+
+ public Factory() {
+ this.sumFields = new HashSet<>();
+ this.maxFields = new HashSet<>();
+ this.minFields = new HashSet<>();
+ this.countFields = new HashSet<>();
+ this.averageFields = new HashSet<>();
+ this.allFields = new HashSet<>();
+ }
+
+ /**
+ * Set the fields for which to find the aggregated sum.
+ *
+ * @param fields
+ * the fields
+ * @return this factory
+ */
+ public Factory withSumFields(Set<String> fields) {
+ addFields(this.sumFields, fields);
+ return this;
+ }
+
+ /**
+ * Set the fields for which to find the aggregated sum.
+ *
+ * @param fields
+ * the fields
+ * @return this factory
+ */
+ public Factory withSumFields(String... fields) {
+ addFields(this.sumFields, fields);
+ return this;
+ }
+
+ /**
+ * Set the fields for which to find the aggregated max.
+ *
+ * @param fields
+ * the fields
+ * @return this factory
+ */
+ public Factory withMaxFields(Set<String> fields) {
+ addFields(this.maxFields, fields);
+ return this;
+ }
+
+ /**
+ * Set the fields for which to find the aggregated max.
+ *
+ * @param fields
+ * the fields
+ * @return this factory
+ */
+ public Factory withMaxFields(String... fields) {
+ addFields(this.maxFields, fields);
+ return this;
+ }
+
+ /**
+ * Set the fields for which to find the aggregated min.
+ *
+ * @param fields
+ * the fields
+ * @return this factory
+ */
+ public Factory withMinFields(Set<String> fields) {
+ addFields(this.minFields, fields);
+ return this;
+ }
+
+ /**
+ * Set the fields for which to find the aggregated min.
+ *
+ * @param fields
+ * the fields
+ * @return this factory
+ */
+ public Factory withMinFields(String... fields) {
+ addFields(this.minFields, fields);
+ return this;
+ }
+
+ /**
+ * Set the fields for which to find the total number of times seen.
+ *
+ * @param fields
+ * the fields
+ * @return this factory
+ */
+ public Factory withCountFields(Set<String> fields) {
+ addFields(this.countFields, fields);
+ return this;
+ }
+
+ /**
+ * Set the fields for which to find the aggregated count.
+ *
+ * @param fields
+ * the fields
+ * @return this factory
+ */
+ public Factory withCountFields(String... fields) {
+ addFields(this.countFields, fields);
+ return this;
+ }
+
+ /**
+ * Set the fields for which to find the aggregated average.
+ *
+ * @param fields
+ * the fields
+ * @return this factory
+ */
+ public Factory withAverageFields(Set<String> fields) {
+ addFields(this.averageFields, fields);
+ return this;
+ }
+
+ /**
+ * Set the fields for which to find the aggregated average.
+ *
+ * @param fields
+ * the fields
+ * @return this factory
+ */
+ public Factory withAverageFields(String... fields) {
+ addFields(this.averageFields, fields);
+ return this;
+ }
+
+ /**
+ * Add the given fields into the given set.
+ *
+ * @param set
+ * the set to add the fields to
+ * @param fields
+ * the fields to add
+ */
+ private void addFields(Set<String> set, Collection<String> fields) {
+ if (fields != null) {
+ set.addAll(fields);
+ allFields.addAll(fields);
+ }
+ }
+
+ private void addFields(Set<String> set, String... fields) {
+ addFields(set, Arrays.asList(fields));
+ }
+
+ /**
+ * Return a new {@link FieldAggregator} with the configured target aggregation fields.
+ *
+ * @return a new {@link FieldAggregator} instance
+ */
+ public FieldAggregator newInstance() {
+ return hasFieldsToAggregate() ? new FieldAggregator(sumFields, maxFields, minFields, countFields, averageFields) : new FieldAggregator();
+ }
+
+ /**
+ * Return whether this factory has any target aggregation fields set.
+ *
+ * @return true if this factory has any target aggregation fields, or false otherwise
+ */
+ public boolean hasFieldsToAggregate() {
+ return !allFields.isEmpty();
+ }
+
+ /**
+ * Return whether the given field matches a target aggregation field in this factory.
+ *
+ * @param field
+ * the field
+ * @return true if the given field is a target for aggregation, or false otherwise
+ */
+ public boolean isFieldToAggregate(String field) {
+ return allFields.contains(field);
+ }
+
+ @Override
+ public String toString() {
+ return new ToStringBuilder(this).append("sumFields", sumFields).append("maxFields", maxFields).append("minFields", minFields)
+ .append("countFields", countFields).append("averageFields", averageFields).append("allFields", allFields).toString();
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (o == null || getClass() != o.getClass()) {
+ return false;
+ }
+ Factory factory = (Factory) o;
+ return Objects.equals(sumFields, factory.sumFields) && Objects.equals(maxFields, factory.maxFields) && Objects.equals(minFields, factory.minFields)
+ && Objects.equals(countFields, factory.countFields) && Objects.equals(averageFields, factory.averageFields)
+ && Objects.equals(allFields, factory.allFields);
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hash(sumFields, maxFields, minFields, countFields, averageFields, allFields);
+ }
+ }
+}
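A sketch of how the factory above is intended to be wired up (illustrative field names; fieldsForGroup is a hypothetical stand-in for the Collection<Field> that DocumentGrouper hands to each group, and AggregateOperation.SUM is assumed to exist alongside COUNT):

    // Configure which fields get which aggregation operations, then stamp out one
    // FieldAggregator per group.
    FieldAggregator.Factory factory = new FieldAggregator.Factory()
            .withSumFields("BALANCE")
            .withCountFields("AGE")
            .withAverageFields("AGE");

    FieldAggregator aggregator = factory.newInstance();
    aggregator.aggregateAll(fieldsForGroup);

    Aggregator<?> balanceSum = aggregator.getAggregator("BALANCE", AggregateOperation.SUM);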
diff --git a/warehouse/query-core/src/main/java/datawave/query/common/grouping/Group.java b/warehouse/query-core/src/main/java/datawave/query/common/grouping/Group.java
new file mode 100644
index 0000000000..dc40eee9bd
--- /dev/null
+++ b/warehouse/query-core/src/main/java/datawave/query/common/grouping/Group.java
@@ -0,0 +1,168 @@
+package datawave.query.common.grouping;
+
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.accumulo.core.security.ColumnVisibility;
+import org.apache.commons.lang.builder.ToStringBuilder;
+
+import com.google.common.collect.HashMultimap;
+import com.google.common.collect.Multimap;
+
+/**
+ * Represents a grouping of values for fields specified via the #GROUP_BY functionality, with information about the total number of times the grouping was seen,
+ * values for target aggregation fields that were matched to this group, and the different column visibilities seen.
+ */
+public class Group {
+
+ /**
+ * The distinct set of values that represent this grouping.
+ */
+ private final ImmutableGrouping grouping;
+
+ /**
+ * The different column visibilities seen for each attribute that makes up the grouping.
+ */
+ private final Multimap<GroupingAttribute<?>,ColumnVisibility> attributeVisibilities = HashMultimap.create();
+
+ /**
+ * The column visibilities for each document that contributed entries to this grouping.
+ */
+ private final Set<ColumnVisibility> documentVisibilities = new HashSet<>();
+
+ /**
+ * The total number of times the distinct grouping was seen.
+ */
+ private int count;
+
+ /**
+ * The aggregated values for any specified fields to aggregate.
+ */
+ private FieldAggregator fieldAggregator = new FieldAggregator();
+
+ public Group(Grouping grouping) {
+ this(grouping, 0);
+ }
+
+ public Group(Grouping grouping, int count) {
+ this.grouping = new ImmutableGrouping(grouping);
+ addAttributeVisibilities(this.grouping);
+ this.count = count;
+ }
+
+ /**
+ * Returns the distinct set of values that represent this grouping.
+ *
+ * @return the grouping
+ */
+ public Grouping getGrouping() {
+ return grouping;
+ }
+
+ /**
+ * Add the column visibilities from each of the given attributes to the set of attribute visibilities for this group.
+ *
+ * @param grouping
+ * the attributes to add visibilities from
+ */
+ public void addAttributeVisibilities(Grouping grouping) {
+ for (GroupingAttribute<?> attribute : grouping) {
+ attributeVisibilities.put(attribute, attribute.getColumnVisibility());
+ }
+ }
+
+ /**
+ * Return the set of column visibilities seen for the given attribute.
+ *
+ * @param attribute
+ * the attribute
+ * @return the column visibilities seen for the given attributes
+ */
+ public Collection<ColumnVisibility> getVisibilitiesForAttribute(GroupingAttribute<?> attribute) {
+ return attributeVisibilities.get(attribute);
+ }
+
+ /**
+ * Add the column visibility to the set of visibilities of documents for which we have seen the grouping of this group in.
+ *
+ * @param columnVisibility
+ * the visibility to add
+ */
+ public void addDocumentVisibility(ColumnVisibility columnVisibility) {
+ this.documentVisibilities.add(columnVisibility);
+ }
+
+ /**
+ * Return the set of all distinct column visibilities from documents that we have seen this group in.
+ *
+ * @return the document column visibilities
+ */
+ public Set<ColumnVisibility> getDocumentVisibilities() {
+ return documentVisibilities;
+ }
+
+ /**
+ * Increment the number of times we have seen this grouping by one.
+ */
+ public void incrementCount() {
+ this.count++;
+ }
+
+ /**
+ * Returns the number of times we have seen this grouping.
+ *
+ * @return the number of times we've seen this group.
+ */
+ public int getCount() {
+ return count;
+ }
+
+ /**
+ * Returns the aggregated fields for this group.
+ *
+ * @return the aggregated fields.
+ */
+ public FieldAggregator getFieldAggregator() {
+ return fieldAggregator;
+ }
+
+ /**
+ * Set the aggregated fields for this group.
+ *
+ * @param fieldAggregator
+ * the aggregated fields to set
+ */
+ public void setFieldAggregator(FieldAggregator fieldAggregator) {
+ this.fieldAggregator = fieldAggregator;
+ }
+
+ public void aggregateAll(Collection<Field> fields) {
+ fieldAggregator.aggregateAll(fields);
+ }
+
+ public void aggregateAll(String field, Collection<Field> fields) {
+ fieldAggregator.aggregateAll(field, fields);
+ }
+
+ /**
+ * Merge the given group into this group. The attribute visibilities and document visibilities from the other group will be added into this group. The count
+ * for this group will be incremented by the count of the other group. The aggregated fields of the other group will be merged into the aggregated fields of
+ * this group.
+ *
+ * @param other
+ * the group to merge
+ */
+ public void merge(Group other) {
+ this.attributeVisibilities.putAll(other.attributeVisibilities);
+ this.documentVisibilities.addAll(other.documentVisibilities);
+ this.count += other.count;
+ this.fieldAggregator.merge(other.fieldAggregator);
+ }
+
+ @Override
+ public String toString() {
+ return new ToStringBuilder(this).append("attributes", grouping).append("attributeVisibilities", attributeVisibilities)
+ .append("documentVisibilities", documentVisibilities).append("count", count).append("aggregatedFields", fieldAggregator).toString();
+ }
+}
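For clarity, a short sketch of the merge semantics described above (illustrative; grouping is a hypothetical stand-in for a Grouping built from GroupingAttribute values):

    Group first = new Group(grouping);       // count starts at 0
    first.incrementCount();                  // seen once in document A
    Group second = new Group(grouping, 2);   // seen twice in document B

    first.merge(second);
    // first.getCount() == 3, and the attribute/document visibilities and aggregated
    // fields from 'second' are now folded into 'first'.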
diff --git a/warehouse/query-core/src/main/java/datawave/query/common/grouping/GroupFields.java b/warehouse/query-core/src/main/java/datawave/query/common/grouping/GroupFields.java
new file mode 100644
index 0000000000..196db28e23
--- /dev/null
+++ b/warehouse/query-core/src/main/java/datawave/query/common/grouping/GroupFields.java
@@ -0,0 +1,466 @@
+package datawave.query.common.grouping;
+
+import java.io.Serializable;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import org.apache.commons.lang.StringUtils;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonValue;
+import com.google.common.collect.HashMultimap;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Multimap;
+import com.google.common.collect.Sets;
+
+import datawave.query.Constants;
+import datawave.query.jexl.JexlASTHelper;
+
+/**
+ * Represents a set of fields that have been specified within a {@code #groupby()} function, as well as any fields specified in the functions {@code #sum()},
+ * {@code #count()}, {@code #average()}, {@code #min()}, and {@code #max()} that should be used when performing a group-by operation on documents. This class
+ * can easily be captured as a parameter string using {@link GroupFields#toString()}, and transformed back into a {@link GroupFields} instance via
+ * {@link GroupFields#from(String)}.
+ */
+public class GroupFields implements Serializable {
+
+ private static final String GROUP = "GROUP";
+ private static final String SUM = "SUM";
+ private static final String COUNT = "COUNT";
+ private static final String AVERAGE = "AVERAGE";
+ private static final String MIN = "MIN";
+ private static final String MAX = "MAX";
+ private static final String MODEL_MAP = "REVERSE_MODEL_MAP";
+
+ private Set<String> groupByFields = new HashSet<>();
+ private Set<String> sumFields = new HashSet<>();
+ private Set<String> countFields = new HashSet<>();
+ private Set<String> averageFields = new HashSet<>();
+ private Set<String> minFields = new HashSet<>();
+ private Set<String> maxFields = new HashSet<>();
+ private Map<String,String> reverseModelMap = new HashMap<>();
+
+ /**
+ * Returns a new {@link GroupFields} parsed from the given string. The string is expected to have the format returned by {@link GroupFields#toString()}, but may
+ * also be a comma-delimited string of fields to group-by to support backwards-compatibility with the legacy format. See below for certain edge cases:
+ *
+ * - Given null, null will be returned.
+ * - Given an empty or blank string, an empty {@link GroupFields} will be returned.
+ * - Given a comma-delimited list of fields, e.g {@code AGE,GENDER}, a {@link GroupFields} with the fields set as the group-by fields will be
+ * returned.
+ *
+ *
+ * @param string
+ * the string to parse
+ * @return the parsed {@link GroupFields}
+ */
+ @JsonCreator
+ public static GroupFields from(String string) {
+ if (string == null) {
+ return null;
+ }
+
+ // Strip whitespaces.
+ string = StringUtils.deleteWhitespace(string);
+
+ GroupFields groupFields = new GroupFields();
+ if (!string.isEmpty()) {
+ // The string contains group fields in the latest format, e.g. GROUP(field,...)...
+ if (string.contains(Constants.LEFT_PAREN)) {
+ // Individual elements are separated by a pipe.
+ String[] elements = StringUtils.split(string, Constants.PIPE);
+
+ // Each element has the format NAME(content).
+ for (String element : elements) {
+ int leftParen = element.indexOf(Constants.LEFT_PAREN);
+ int rightParen = element.length() - 1;
+ String name = element.substring(0, leftParen);
+ String elementContents = element.substring(leftParen + 1, rightParen);
+ switch (name) {
+ case GROUP:
+ groupFields.groupByFields = parseSet(elementContents);
+ break;
+ case SUM:
+ groupFields.sumFields = parseSet(elementContents);
+ break;
+ case COUNT:
+ groupFields.countFields = parseSet(elementContents);
+ break;
+ case AVERAGE:
+ groupFields.averageFields = parseSet(elementContents);
+ break;
+ case MIN:
+ groupFields.minFields = parseSet(elementContents);
+ break;
+ case MAX:
+ groupFields.maxFields = parseSet(elementContents);
+ break;
+ case MODEL_MAP:
+ groupFields.reverseModelMap = parseMap(elementContents);
+ break;
+ default:
+ throw new IllegalArgumentException("Invalid element " + name);
+ }
+ }
+ } else {
+ // Otherwise, the string may be in the legacy format of a comma-delimited string with group-fields only.
+ String[] groupByFields = StringUtils.split(string, Constants.PARAM_VALUE_SEP);
+ groupFields.setGroupByFields(Sets.newHashSet(groupByFields));
+ }
+ }
+ return groupFields;
+ }
+
+ // Parse a set of fields from the string.
+ private static Set<String> parseSet(String str) {
+ return Sets.newHashSet(StringUtils.split(str, Constants.COMMA));
+ }
+
+ // Parse a map from the given string.
+ private static Map<String,String> parseMap(String str) {
+ Map<String,String> map = new HashMap<>();
+ String[] entries = StringUtils.split(str, Constants.COLON);
+ for (String entry : entries) {
+ int equals = entry.indexOf(Constants.EQUALS);
+ String key = entry.substring(0, equals);
+ String value = entry.substring(equals + 1);
+ map.put(key, value);
+ }
+ return map;
+ }
+
+ /**
+ * Return a copy of the given {@link GroupFields}.
+ *
+ * @param other
+ * the other instance to copy
+ * @return the copy
+ */
+ public static GroupFields copyOf(GroupFields other) {
+ if (other == null) {
+ return null;
+ }
+
+ GroupFields copy = new GroupFields();
+ copy.groupByFields = other.groupByFields == null ? null : Sets.newHashSet(other.groupByFields);
+ copy.sumFields = other.sumFields == null ? null : Sets.newHashSet(other.sumFields);
+ copy.countFields = other.countFields == null ? null : Sets.newHashSet(other.countFields);
+ copy.averageFields = other.averageFields == null ? null : Sets.newHashSet(other.averageFields);
+ copy.minFields = other.minFields == null ? null : Sets.newHashSet(other.minFields);
+ copy.maxFields = other.maxFields == null ? null : Sets.newHashSet(other.maxFields);
+ copy.reverseModelMap = other.reverseModelMap == null ? null : Maps.newHashMap(other.reverseModelMap);
+ return copy;
+ }
+
+ /**
+ * Set the fields to group by.
+ *
+ * @param fields
+ * the fields
+ */
+ public void setGroupByFields(Set<String> fields) {
+ this.groupByFields = fields;
+ }
+
+ /**
+ * Set the fields to sum.
+ *
+ * @param fields
+ * the fields
+ */
+ public void setSumFields(Set<String> fields) {
+ this.sumFields = fields;
+ }
+
+ /**
+ * Set the fields to count.
+ *
+ * @param fields
+ * the fields
+ */
+ public void setCountFields(Set<String> fields) {
+ this.countFields = fields;
+ }
+
+ /**
+ * Set the fields to average.
+ *
+ * @param fields
+ * the fields
+ */
+ public void setAverageFields(Set<String> fields) {
+ this.averageFields = fields;
+ }
+
+ /**
+ * Set the fields to find the min of.
+ *
+ * @param fields
+ * the fields
+ */
+ public void setMinFields(Set<String> fields) {
+ this.minFields = fields;
+ }
+
+ /**
+ * Set the fields to find the max of.
+ *
+ * @param fields
+ * the fields
+ */
+ public void setMaxFields(Set<String> fields) {
+ this.maxFields = fields;
+ }
+
+ /**
+ * Return the fields to group by.
+ *
+ * @return the fields
+ */
+ public Set<String> getGroupByFields() {
+ return groupByFields;
+ }
+
+ /**
+ * Return the fields to sum.
+ *
+ * @return the fields
+ */
+ public Set<String> getSumFields() {
+ return sumFields;
+ }
+
+ /**
+ * Return the fields to count.
+ *
+ * @return the fields
+ */
+ public Set<String> getCountFields() {
+ return countFields;
+ }
+
+ /**
+ * Return the fields to average.
+ *
+ * @return the fields
+ */
+ public Set<String> getAverageFields() {
+ return averageFields;
+ }
+
+ /**
+ * Return the fields to find the min of.
+ *
+ * @return the fields
+ */
+ public Set<String> getMinFields() {
+ return minFields;
+ }
+
+ /**
+ * Return the fields to find the max of.
+ *
+ * @return the fields
+ */
+ public Set<String> getMaxFields() {
+ return maxFields;
+ }
+
+ /**
+ * Return whether this {@link GroupFields} has any fields to group by.
+ *
+ * @return true if there are fields to group by, or false otherwise
+ */
+ public boolean hasGroupByFields() {
+ return groupByFields != null && !groupByFields.isEmpty();
+ }
+
+ /**
+ * Return the set of all fields to group by, sum, count, average, and find the min and max of, plus any reverse model map fields, that must be included in
+ * the projection.
+ *
+ * @return the fields required to be included in the projection
+ */
+ public Set<String> getProjectionFields() {
+ Set<String> fields = new HashSet<>();
+ fields.addAll(this.groupByFields);
+ fields.addAll(this.sumFields);
+ fields.addAll(this.countFields);
+ fields.addAll(this.averageFields);
+ fields.addAll(this.minFields);
+ fields.addAll(this.maxFields);
+ fields.addAll(this.reverseModelMap.keySet());
+ fields.addAll(this.reverseModelMap.values());
+ return fields;
+ }
+
+ /**
+ * Deconstruct the identifiers of all fields in this {@link GroupFields}.
+ */
+ public void deconstructIdentifiers() {
+ this.groupByFields = deconstructIdentifiers(this.groupByFields);
+ this.sumFields = deconstructIdentifiers(this.sumFields);
+ this.countFields = deconstructIdentifiers(this.countFields);
+ this.averageFields = deconstructIdentifiers(this.averageFields);
+ this.minFields = deconstructIdentifiers(this.minFields);
+ this.maxFields = deconstructIdentifiers(this.maxFields);
+ }
+
+ // Return a copy of the given set with all identifiers deconstructed.
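+ // For example, a JEXL-escaped identifier such as "$AGE" (hypothetical field name) is expected to deconstruct back to "AGE".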
+ private Set<String> deconstructIdentifiers(Set<String> set) {
+ return set.stream().map(JexlASTHelper::deconstructIdentifier).collect(Collectors.toSet());
+ }
+
+ /**
+ * Modify this {@link GroupFields} to ensure that all sets of fields also include their alternative model mappings, and retain the relevant subset of the
+ * given reverse model map.
+ *
+ * @param modelMap
+ * the map to retrieve alternative field mappings from
+ * @param reverseModelMap
+ * the reverse model map from which to retain mappings for the remapped fields
+ */
+ public void remapFields(Multimap<String,String> modelMap, Map<String,String> reverseModelMap) {
+ this.groupByFields = remap(this.groupByFields, modelMap);
+ this.sumFields = remap(this.sumFields, modelMap);
+ this.countFields = remap(this.countFields, modelMap);
+ this.averageFields = remap(this.averageFields, modelMap);
+ this.minFields = remap(this.minFields, modelMap);
+ this.maxFields = remap(this.maxFields, modelMap);
+
+ // Make a copy of the given reverse model map that only contains relevant mappings for efficiency.
+ Set<String> allFields = new HashSet<>();
+ allFields.addAll(groupByFields);
+ allFields.addAll(sumFields);
+ allFields.addAll(countFields);
+ allFields.addAll(averageFields);
+ allFields.addAll(minFields);
+ allFields.addAll(maxFields);
+
+ this.reverseModelMap = new HashMap<>();
+ for (String field : allFields) {
+ if (reverseModelMap.containsKey(field)) {
+ this.reverseModelMap.put(field, reverseModelMap.get(field));
+ }
+ }
+
+ // Now reduce the fields to only those that map to themselves with respect to the reverse model map.
+ this.groupByFields = reduce(this.groupByFields, this.reverseModelMap);
+ this.sumFields = reduce(this.sumFields, this.reverseModelMap);
+ this.countFields = reduce(this.countFields, this.reverseModelMap);
+ this.averageFields = reduce(this.averageFields, this.reverseModelMap);
+ this.minFields = reduce(this.minFields, this.reverseModelMap);
+ this.maxFields = reduce(this.maxFields, this.reverseModelMap);
+ }
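+
+ // Rough sketch of the behavior above, with hypothetical fields: if groupByFields = {A}, modelMap maps
+ // A -> B, and the supplied reverseModelMap maps B -> A, then remap() expands the group-by set to {A, B},
+ // the retained reverse model map is trimmed to {B=A}, and reduce() drops B (it maps to A rather than to
+ // itself), leaving {A} as the effective group-by field.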
+
+ private Set<String> reduce(Set<String> set, Map<String,String> map) {
+ return set.stream().filter(s -> s.equals(map.getOrDefault(s, s))).collect(Collectors.toSet());
+ }
+
+ // Return a copy of the given set with all alternative field mappings included.
+ private Set<String> remap(Set<String> set, Multimap<String,String> map) {
+ Set<String> newMappings = new HashSet<>(set);
+ for (String field : set) {
+ field = field.toUpperCase();
+ if (map.containsKey(field)) {
+ newMappings.addAll(map.get(field));
+ }
+ }
+ return newMappings;
+ }
+
+ /**
+ * Return the reverse model map. This map will never be null, but may be empty if this {@link GroupFields} was never remapped via
+ * {@link GroupFields#remapFields(Multimap, Map)}.
+ *
+ * @return the reverse model map
+ */
+ public Map getReverseModelMap() {
+ return reverseModelMap;
+ }
+
+ /**
+ * Return a new {@link FieldAggregator.Factory} instance configured with the aggregation fields of this {@link GroupFields}.
+ *
+ * @return a configured {@link FieldAggregator.Factory} instance
+ */
+ public FieldAggregator.Factory getFieldAggregatorFactory() {
+ return new FieldAggregator.Factory().withSumFields(sumFields).withCountFields(countFields).withAverageFields(averageFields).withMinFields(minFields)
+ .withMaxFields(maxFields);
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (o == null || getClass() != o.getClass()) {
+ return false;
+ }
+ GroupFields that = (GroupFields) o;
+ return Objects.equals(groupByFields, that.groupByFields) && Objects.equals(sumFields, that.sumFields) && Objects.equals(countFields, that.countFields)
+ && Objects.equals(averageFields, that.averageFields) && Objects.equals(minFields, that.minFields)
+ && Objects.equals(maxFields, that.maxFields) && Objects.equals(reverseModelMap, that.reverseModelMap);
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hash(groupByFields, sumFields, countFields, averageFields, minFields, maxFields, reverseModelMap);
+ }
+
+ @JsonValue
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ writeFormattedSet(sb, GROUP, this.groupByFields);
+ writeFormattedSet(sb, SUM, this.sumFields);
+ writeFormattedSet(sb, COUNT, this.countFields);
+ writeFormattedSet(sb, AVERAGE, this.averageFields);
+ writeFormattedSet(sb, MIN, this.minFields);
+ writeFormattedSet(sb, MAX, this.maxFields);
+ writeFormattedModelMap(sb);
+ return sb.toString();
+ }
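+
+ // Assuming the same constant values as in the earlier sketch, a populated instance serializes to something
+ // like "GROUP(AGE,GENDER)|SUM(SALARY)|MODEL_MAP(AG=AGE)", which from(String) parses back into an equal
+ // instance, giving a round trip for the @JsonValue/@JsonCreator serialization.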
+
+ // Write the given set if not empty to the given string builder.
+ private void writeFormattedSet(StringBuilder sb, String name, Set<String> set) {
+ if (!set.isEmpty()) {
+ if (sb.length() > 0) {
+ sb.append(Constants.PIPE);
+ }
+ sb.append(name);
+ sb.append(Constants.LEFT_PAREN);
+ Iterator<String> iterator = set.iterator();
+ while (iterator.hasNext()) {
+ String next = iterator.next();
+ sb.append(next);
+ if (iterator.hasNext()) {
+ sb.append(Constants.COMMA);
+ }
+ }
+ sb.append(Constants.RIGHT_PAREN);
+ }
+ }
+
+ // Write the model map if not empty to the given string builder.
+ private void writeFormattedModelMap(StringBuilder sb) {
+ if (!reverseModelMap.isEmpty()) {
+ if (sb.length() > 0) {
+ sb.append(Constants.PIPE);
+ }
+ sb.append(MODEL_MAP).append(Constants.LEFT_PAREN);
+ Iterator<Map.Entry<String,String>> entryIterator = reverseModelMap.entrySet().iterator();
+ while (entryIterator.hasNext()) {
+ Map.Entry<String,String> next = entryIterator.next();
+ sb.append(next.getKey()).append(Constants.EQUALS).append(next.getValue());
+ if (entryIterator.hasNext()) {
+ sb.append(Constants.COLON);
+ }
+ }
+ sb.append(Constants.RIGHT_PAREN);
+ }
+ }
+}
diff --git a/warehouse/query-core/src/main/java/datawave/query/common/grouping/Grouping.java b/warehouse/query-core/src/main/java/datawave/query/common/grouping/Grouping.java
new file mode 100644
index 0000000000..736dbb4c0d
--- /dev/null
+++ b/warehouse/query-core/src/main/java/datawave/query/common/grouping/Grouping.java
@@ -0,0 +1,118 @@
+package datawave.query.common.grouping;
+
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.function.Predicate;
+
+/**
+ * This class represents a {@link HashSet} of {@link GroupingAttribute} elements that maintains a cached hashcode that is calculated once at instantiation, and
+ * subsequently recalculated any time this set is modified. This class is used as a key within maps and as such, the cached hashcode allows us to avoid
+ * calculating the hashcode each time a search operation is performed on the keys of the maps.
+ */
+public class Grouping extends HashSet<GroupingAttribute<?>> {
+
+ public static final Grouping EMPTY_GROUPING = new Grouping(Collections.emptySet());
+
+ public static Grouping emptyGrouping() {
+ return EMPTY_GROUPING;
+ }
+
+ // The cached hashcode.
+ private int cachedHashcode;
+
+ /**
+ * Return a new {@link Grouping} instance containing the elements of the given collection.
+ *
+ * @param collection
+ * the collection
+ * @return the new grouping
+ */
+ public static Grouping of(Collection<? extends GroupingAttribute<?>> collection) {
+ return new Grouping(collection);
+ }
+
+ public Grouping() {
+ super();
+ updateCachedHashcode();
+ }
+
+ public Grouping(GroupingAttribute<?> attribute) {
+ super();
+ add(attribute);
+ updateCachedHashcode();
+ }
+
+ public Grouping(Collection<? extends GroupingAttribute<?>> collection) {
+ super(collection);
+ updateCachedHashcode();
+ }
+
+ @Override
+ public boolean add(GroupingAttribute<?> groupingAttribute) {
+ boolean modified = super.add(groupingAttribute);
+ if (modified) {
+ updateCachedHashcode();
+ }
+ return modified;
+ }
+
+ @Override
+ public boolean addAll(Collection<? extends GroupingAttribute<?>> collection) {
+ boolean modified = super.addAll(collection);
+ if (modified) {
+ updateCachedHashcode();
+ }
+ return modified;
+ }
+
+ @Override
+ public boolean remove(Object o) {
+ boolean modified = super.remove(o);
+ if (modified) {
+ updateCachedHashcode();
+ }
+ return modified;
+ }
+
+ @Override
+ public boolean removeAll(Collection<?> collection) {
+ boolean modified = super.removeAll(collection);
+ if (modified) {
+ updateCachedHashcode();
+ }
+ return modified;
+ }
+
+ @Override
+ public boolean removeIf(Predicate<? super GroupingAttribute<?>> filter) {
+ boolean modified = super.removeIf(filter);
+ if (modified) {
+ updateCachedHashcode();
+ }
+ return modified;
+ }
+
+ @Override
+ public void clear() {
+ super.clear();
+ updateCachedHashcode();
+ }
+
+ /**
+ * Returns the cached hashcode.
+ *
+ * @return the hashcode
+ */
+ @Override
+ public int hashCode() {
+ return cachedHashcode;
+ }
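+
+ // Hypothetical usage sketch: because the hashcode is cached, repeated lookups keyed on a Grouping avoid
+ // rehashing every attribute, e.g.
+ // Map<Grouping,Integer> counts = new HashMap<>();
+ // counts.merge(Grouping.of(attributes), 1, Integer::sum); // "attributes" built elsewhere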
+
+ /**
+ * Update the cached hashcode based on the current elements.
+ */
+ private void updateCachedHashcode() {
+ cachedHashcode = super.hashCode();
+ }
+}
diff --git a/warehouse/query-core/src/main/java/datawave/query/common/grouping/GroupingAttribute.java b/warehouse/query-core/src/main/java/datawave/query/common/grouping/GroupingAttribute.java
new file mode 100644
index 0000000000..54db8ddf71
--- /dev/null
+++ b/warehouse/query-core/src/main/java/datawave/query/common/grouping/GroupingAttribute.java
@@ -0,0 +1,62 @@
+package datawave.query.common.grouping;
+
+import org.apache.accumulo.core.data.Key;
+import org.apache.commons.lang.builder.HashCodeBuilder;
+
+import datawave.data.type.Type;
+import datawave.query.attributes.Attribute;
+import datawave.query.attributes.TypeAttribute;
+
+/**
+ * This class serves as a wrapper for the {@link TypeAttribute} that overrides the default {@code equals()} and {@code hashCode()} behavior so that equality is
+ * determined by the attribute's field and value, and the hashCode is generated solely with the attribute's value.
+ *
+ * @param <T>