Support Managed Iceberg streaming writes (#32451)
* iceberg streaming writes

* cleanup

* address comments
ahmedabu98 committed Sep 19, 2024
1 parent ec307a5 commit 75a4637
Showing 12 changed files with 355 additions and 42 deletions.
2 changes: 1 addition & 1 deletion .github/trigger_files/IO_Iceberg_Integration_Tests.json
@@ -1,4 +1,4 @@
{
"comment": "Modify this file in a trivial way to cause this test suite to run",
"modification": 1
"modification": 2
}
3 changes: 1 addition & 2 deletions sdks/java/io/expansion-service/build.gradle
@@ -47,8 +47,7 @@ dependencies {
// **** IcebergIO runtime dependencies ****
runtimeOnly library.java.hadoop_client
// Needed when using GCS as the warehouse location.
implementation library.java.bigdataoss_gcs_connector
permitUnusedDeclared library.java.bigdataoss_gcs_connector
runtimeOnly library.java.bigdataoss_gcs_connector
// Needed for HiveCatalog
runtimeOnly ("org.apache.iceberg:iceberg-hive-metastore:1.4.2")
runtimeOnly project(path: ":sdks:java:io:iceberg:hive:exec", configuration: "shadow")
15 changes: 11 additions & 4 deletions sdks/java/io/iceberg/hive/exec/build.gradle
@@ -39,10 +39,17 @@ artifacts {
shadowJar {
zip64 true

relocate 'com.google.common', getJavaRelocatedPath('iceberg.hive.com.google.common')
relocate 'com.google.protobuf', getJavaRelocatedPath('iceberg.hive.com.google.protobuf')
relocate 'shaded.parquet', getJavaRelocatedPath('iceberg.hive.shaded.parquet')
relocate 'org.apache.parquet', getJavaRelocatedPath('iceberg.hive.org.apache.parquet')
def problematicPackages = [
'com.google.protobuf',
'com.google.common',
'shaded.parquet',
'org.apache.parquet',
'org.joda'
]

problematicPackages.forEach {
relocate it, getJavaRelocatedPath("iceberg.hive.${it}")
}

version "3.1.3"
mergeServiceFiles()
sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AppendFilesToTables.java
@@ -19,6 +19,8 @@

import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.metrics.Counter;
import org.apache.beam.sdk.metrics.Metrics;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.GroupByKey;
import org.apache.beam.sdk.transforms.PTransform;
@@ -29,14 +31,17 @@
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.iceberg.AppendFiles;
import org.apache.iceberg.Snapshot;
import org.apache.iceberg.Table;
import org.apache.iceberg.catalog.Catalog;
import org.apache.iceberg.catalog.TableIdentifier;
import org.checkerframework.checker.nullness.qual.MonotonicNonNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

class AppendFilesToTables
extends PTransform<PCollection<FileWriteResult>, PCollection<KV<String, SnapshotInfo>>> {

private static final Logger LOG = LoggerFactory.getLogger(AppendFilesToTables.class);
private final IcebergCatalogConfig catalogConfig;

AppendFilesToTables(IcebergCatalogConfig catalogConfig) {
@@ -66,6 +71,8 @@ public String apply(FileWriteResult input) {

private static class AppendFilesToTablesDoFn
extends DoFn<KV<String, Iterable<FileWriteResult>>, KV<String, SnapshotInfo>> {
private final Counter snapshotsCreated =
Metrics.counter(AppendFilesToTables.class, "snapshotsCreated");

private final IcebergCatalogConfig catalogConfig;

@@ -87,15 +94,21 @@ public void processElement(
@Element KV<String, Iterable<FileWriteResult>> element,
OutputReceiver<KV<String, SnapshotInfo>> out,
BoundedWindow window) {
if (!element.getValue().iterator().hasNext()) {
return;
}

Table table = getCatalog().loadTable(TableIdentifier.parse(element.getKey()));
AppendFiles update = table.newAppend();
for (FileWriteResult writtenFile : element.getValue()) {
update.appendManifest(writtenFile.getManifestFile());
}
update.commit();
Snapshot snapshot = table.currentSnapshot();
LOG.info("Created new snapshot for table '{}': {}.", element.getKey(), snapshot);
snapshotsCreated.inc();
out.outputWithTimestamp(
KV.of(element.getKey(), SnapshotInfo.fromSnapshot(table.currentSnapshot())),
window.maxTimestamp());
KV.of(element.getKey(), SnapshotInfo.fromSnapshot(snapshot)), window.maxTimestamp());
}
}
}
sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergIO.java
@@ -17,6 +17,7 @@
*/
package org.apache.beam.sdk.io.iceberg;

import static org.apache.beam.sdk.util.Preconditions.checkArgumentNotNull;
import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull;

import com.google.auto.value.AutoValue;
@@ -25,6 +26,12 @@
import org.apache.beam.sdk.annotations.Internal;
import org.apache.beam.sdk.io.Read;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.windowing.AfterFirst;
import org.apache.beam.sdk.transforms.windowing.AfterPane;
import org.apache.beam.sdk.transforms.windowing.AfterProcessingTime;
import org.apache.beam.sdk.transforms.windowing.GlobalWindows;
import org.apache.beam.sdk.transforms.windowing.Repeatedly;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.values.PBegin;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.Row;
@@ -33,6 +40,7 @@
import org.apache.iceberg.Table;
import org.apache.iceberg.catalog.TableIdentifier;
import org.checkerframework.checker.nullness.qual.Nullable;
import org.joda.time.Duration;

/**
* The underlying Iceberg connector used by {@link org.apache.beam.sdk.managed.Managed#ICEBERG}. Not
@@ -49,13 +57,16 @@ public static WriteRows writeRows(IcebergCatalogConfig catalog) {

@AutoValue
public abstract static class WriteRows extends PTransform<PCollection<Row>, IcebergWriteResult> {
private static final int TRIGGERING_RECORD_COUNT = 50_000;

abstract IcebergCatalogConfig getCatalogConfig();

abstract @Nullable TableIdentifier getTableIdentifier();

abstract @Nullable DynamicDestinations getDynamicDestinations();

abstract @Nullable Duration getTriggeringFrequency();

abstract Builder toBuilder();

@AutoValue.Builder
@@ -66,6 +77,8 @@ abstract static class Builder {

abstract Builder setDynamicDestinations(DynamicDestinations destinations);

abstract Builder setTriggeringFrequency(Duration triggeringFrequency);

abstract WriteRows build();
}

@@ -77,6 +90,21 @@ public WriteRows to(DynamicDestinations destinations) {
return toBuilder().setDynamicDestinations(destinations).build();
}

/**
* Sets the frequency at which data is committed and a new {@link org.apache.iceberg.Snapshot}
* is produced.
*
* <p>Roughly every triggeringFrequency duration, this connector will try to accumulate all
* {@link org.apache.iceberg.ManifestFile}s and commit them to the table as appended files. Each
* commit results in a new table {@link org.apache.iceberg.Snapshot}.
*
* <p>This is only applicable when writing an unbounded {@link PCollection} (i.e. a streaming
* pipeline).
*/
public WriteRows withTriggeringFrequency(Duration triggeringFrequency) {
return toBuilder().setTriggeringFrequency(triggeringFrequency).build();
}

@Override
public IcebergWriteResult expand(PCollection<Row> input) {
List<?> allToArgs = Arrays.asList(getTableIdentifier(), getDynamicDestinations());
@@ -89,11 +117,32 @@ public IcebergWriteResult expand(PCollection<Row> input) {
destinations =
DynamicDestinations.singleTable(Preconditions.checkNotNull(getTableIdentifier()));
}

if (input.isBounded().equals(PCollection.IsBounded.UNBOUNDED)) {
Duration triggeringFrequency = getTriggeringFrequency();
checkArgumentNotNull(
triggeringFrequency, "Streaming pipelines must set a triggering frequency.");
input =
input.apply(
"WindowIntoGlobal",
Window.<Row>into(new GlobalWindows())
.triggering(
Repeatedly.forever(
AfterFirst.of(
AfterProcessingTime.pastFirstElementInPane()
.plusDelayOf(triggeringFrequency),
AfterPane.elementCountAtLeast(TRIGGERING_RECORD_COUNT))))
.discardingFiredPanes());
} else {
Preconditions.checkArgument(
getTriggeringFrequency() == null,
"Triggering frequency is only applicable for streaming pipelines.");
}
return input
.apply("Set Destination Metadata", new AssignDestinations(destinations))
.apply(
"Write Rows to Destinations",
new WriteToDestinations(getCatalogConfig(), destinations));
new WriteToDestinations(getCatalogConfig(), destinations, getTriggeringFrequency()));
}
}

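For context, the snippet below is a minimal sketch of how the new streaming write path can be exercised. The unbounded source, table identifier, and catalog properties are hypothetical; only writeRows(), to(), and withTriggeringFrequency() come from this API, and per the class javadoc it is internal, with Managed#ICEBERG as the intended entry point.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.GenerateSequence;
import org.apache.beam.sdk.io.iceberg.IcebergCatalogConfig;
import org.apache.beam.sdk.io.iceberg.IcebergIO;
import org.apache.beam.sdk.io.iceberg.IcebergWriteResult;
import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.Row;
import org.apache.beam.sdk.values.TypeDescriptors;
import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.catalog.TableIdentifier;
import org.joda.time.Duration;

public class StreamingIcebergWriteExample {
  public static void main(String[] args) {
    Schema schema = Schema.builder().addInt64Field("id").build();
    Pipeline pipeline = Pipeline.create();

    // An unbounded input: GenerateSequence without an upper bound yields a streaming PCollection.
    PCollection<Row> rows =
        pipeline
            .apply(GenerateSequence.from(0).withRate(10, Duration.standardSeconds(1)))
            .apply(
                MapElements.into(TypeDescriptors.rows())
                    .via((Long i) -> Row.withSchema(schema).addValue(i).build()))
            .setRowSchema(schema);

    // Hypothetical catalog; the builder methods are the same ones used by the schema
    // transform provider further down in this commit.
    IcebergCatalogConfig catalog =
        IcebergCatalogConfig.builder()
            .setCatalogName("my_catalog")
            .setCatalogProperties(
                ImmutableMap.of("type", "hadoop", "warehouse", "gs://my-bucket/warehouse"))
            .build();

    // Unbounded inputs must set a triggering frequency: manifests are committed (creating a
    // new snapshot) roughly every 30 seconds, or sooner once 50,000 records accumulate.
    IcebergWriteResult result =
        rows.apply(
            IcebergIO.writeRows(catalog)
                .to(TableIdentifier.parse("db.my_table"))
                .withTriggeringFrequency(Duration.standardSeconds(30)));

    pipeline.run();
  }
}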
sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergWriteSchemaTransformProvider.java
@@ -17,13 +17,20 @@
*/
package org.apache.beam.sdk.io.iceberg;

import static org.apache.beam.sdk.io.iceberg.IcebergWriteSchemaTransformProvider.Configuration;

import com.google.auto.service.AutoService;
import com.google.auto.value.AutoValue;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import org.apache.beam.sdk.managed.ManagedTransformConstants;
import org.apache.beam.sdk.schemas.AutoValueSchema;
import org.apache.beam.sdk.schemas.NoSuchSchemaException;
import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.sdk.schemas.SchemaRegistry;
import org.apache.beam.sdk.schemas.annotations.DefaultSchema;
import org.apache.beam.sdk.schemas.annotations.SchemaFieldDescription;
import org.apache.beam.sdk.schemas.transforms.SchemaTransform;
import org.apache.beam.sdk.schemas.transforms.SchemaTransformProvider;
import org.apache.beam.sdk.schemas.transforms.TypedSchemaTransformProvider;
@@ -35,14 +42,16 @@
import org.apache.beam.sdk.values.Row;
import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting;
import org.apache.iceberg.catalog.TableIdentifier;
import org.checkerframework.checker.nullness.qual.Nullable;
import org.joda.time.Duration;

/**
* SchemaTransform implementation for {@link IcebergIO#writeRows}. Writes Beam Rows to Iceberg and
* outputs a {@code PCollection<Row>} representing snapshots created in the process.
*/
@AutoService(SchemaTransformProvider.class)
public class IcebergWriteSchemaTransformProvider
extends TypedSchemaTransformProvider<SchemaTransformConfiguration> {
extends TypedSchemaTransformProvider<Configuration> {

static final String INPUT_TAG = "input";
static final String OUTPUT_TAG = "output";
@@ -57,8 +66,55 @@ public String description() {
+ "{\"table\" (str), \"operation\" (str), \"summary\" (map[str, str]), \"manifestListLocation\" (str)}";
}

@DefaultSchema(AutoValueSchema.class)
@AutoValue
public abstract static class Configuration {
public static Builder builder() {
return new AutoValue_IcebergWriteSchemaTransformProvider_Configuration.Builder();
}

@SchemaFieldDescription("Identifier of the Iceberg table.")
public abstract String getTable();

@SchemaFieldDescription("Name of the catalog containing the table.")
public abstract @Nullable String getCatalogName();

@SchemaFieldDescription("Properties used to set up the Iceberg catalog.")
public abstract @Nullable Map<String, String> getCatalogProperties();

@SchemaFieldDescription("Properties passed to the Hadoop Configuration.")
public abstract @Nullable Map<String, String> getConfigProperties();

@SchemaFieldDescription(
"For a streaming pipeline, sets the frequency at which snapshots are produced.")
public abstract @Nullable Integer getTriggeringFrequencySeconds();

@AutoValue.Builder
public abstract static class Builder {
public abstract Builder setTable(String table);

public abstract Builder setCatalogName(String catalogName);

public abstract Builder setCatalogProperties(Map<String, String> catalogProperties);

public abstract Builder setConfigProperties(Map<String, String> confProperties);

public abstract Builder setTriggeringFrequencySeconds(Integer triggeringFrequencySeconds);

public abstract Configuration build();
}

public IcebergCatalogConfig getIcebergCatalog() {
return IcebergCatalogConfig.builder()
.setCatalogName(getCatalogName())
.setCatalogProperties(getCatalogProperties())
.setConfigProperties(getConfigProperties())
.build();
}
}

@Override
protected SchemaTransform from(SchemaTransformConfiguration configuration) {
protected SchemaTransform from(Configuration configuration) {
return new IcebergWriteSchemaTransform(configuration);
}

@@ -78,9 +134,9 @@ public String identifier() {
}

static class IcebergWriteSchemaTransform extends SchemaTransform {
private final SchemaTransformConfiguration configuration;
private final Configuration configuration;

IcebergWriteSchemaTransform(SchemaTransformConfiguration configuration) {
IcebergWriteSchemaTransform(Configuration configuration) {
this.configuration = configuration;
}

@@ -89,7 +145,7 @@ Row getConfigurationRow() {
// To stay consistent with our SchemaTransform configuration naming conventions,
// we sort lexicographically and convert field names to snake_case
return SchemaRegistry.createDefault()
.getToRowFunction(SchemaTransformConfiguration.class)
.getToRowFunction(Configuration.class)
.apply(configuration)
.sorted()
.toSnakeCase();
@@ -102,11 +158,17 @@ Row getConfigurationRow() {
public PCollectionRowTuple expand(PCollectionRowTuple input) {
PCollection<Row> rows = input.get(INPUT_TAG);

IcebergIO.WriteRows writeTransform =
IcebergIO.writeRows(configuration.getIcebergCatalog())
.to(TableIdentifier.parse(configuration.getTable()));

Integer trigFreq = configuration.getTriggeringFrequencySeconds();
if (trigFreq != null) {
writeTransform = writeTransform.withTriggeringFrequency(Duration.standardSeconds(trigFreq));
}

// TODO: support dynamic destinations
IcebergWriteResult result =
rows.apply(
IcebergIO.writeRows(configuration.getIcebergCatalog())
.to(TableIdentifier.parse(configuration.getTable())));
IcebergWriteResult result = rows.apply(writeTransform);

PCollection<Row> snapshots =
result
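For context, the snippet below sketches the equivalent Managed usage for a streaming pipeline. The table identifier, catalog name, and properties are hypothetical, and the snake_case config keys are an assumption based on the field-name conversion performed in getConfigurationRow(); the "input"/"output" tags match INPUT_TAG and OUTPUT_TAG above.

import java.util.Map;
import org.apache.beam.sdk.managed.Managed;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionRowTuple;
import org.apache.beam.sdk.values.Row;
import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap;

public class ManagedIcebergStreamingWrite {
  /** Writes an (unbounded) PCollection of Rows to Iceberg via the Managed API. */
  static PCollection<Row> write(PCollection<Row> rows) {
    Map<String, Object> config =
        ImmutableMap.<String, Object>builder()
            .put("table", "db.my_table") // hypothetical table identifier
            .put("catalog_name", "my_catalog") // hypothetical catalog name
            .put(
                "catalog_properties",
                ImmutableMap.of("type", "hadoop", "warehouse", "gs://my-bucket/warehouse"))
            // New in this change: commit accumulated files (and produce a snapshot) ~every 30s.
            .put("triggering_frequency_seconds", 30)
            .build();

    PCollectionRowTuple result =
        PCollectionRowTuple.of("input", rows)
            .apply(Managed.write(Managed.ICEBERG).withConfig(config));

    // One Row per snapshot created: {table, operation, summary, manifestListLocation}.
    return result.get("output");
  }
}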
sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/RecordWriter.java
@@ -36,7 +36,8 @@

class RecordWriter {
private static final Logger LOG = LoggerFactory.getLogger(RecordWriter.class);
private final Counter activeWriters = Metrics.counter(RecordWriterManager.class, "activeWriters");
private final Counter activeIcebergWriters =
Metrics.counter(RecordWriterManager.class, "activeIcebergWriters");
private final DataWriter<Record> icebergDataWriter;
private final Table table;
private final String absoluteFilename;
@@ -92,7 +93,7 @@ class RecordWriter {
default:
throw new RuntimeException("Unknown File Format: " + fileFormat);
}
activeWriters.inc();
activeIcebergWriters.inc();
LOG.info(
"Opened {} writer for table {}, partition {}. Writing to path: {}",
fileFormat,
@@ -115,7 +116,7 @@ public void close() throws IOException {
fileFormat, table.name(), absoluteFilename),
e);
}
activeWriters.dec();
activeIcebergWriters.dec();
LOG.info("Closed {} writer for table {}, path: {}", fileFormat, table.name(), absoluteFilename);
}
