From 8bb53d6b8801b59b5b13817534c45ee69bef60f5 Mon Sep 17 00:00:00 2001 From: Vikash Kumar Date: Tue, 13 Jun 2023 02:51:33 +0530 Subject: [PATCH] Support iceberg `register_table` procedure to register hadoop tables Co-authored-by: Marius Grama --- .../io/trino/plugin/iceberg/IcebergUtil.java | 9 +++ .../TestIcebergRegisterTableProcedure.java | 68 +++++++++++++++++++ .../trino/plugin/iceberg/TestIcebergUtil.java | 13 ++++ 3 files changed, 90 insertions(+) diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergUtil.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergUtil.java index 66269ce6ce0bd..e97b8b1b3d75a 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergUtil.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergUtil.java @@ -172,6 +172,11 @@ public final class IcebergUtil // - 00001-409702ba-4735-4645-8f14-09537cc0b2c8.gz.metadata.json (https://github.com/apache/iceberg/blob/ab398a0d5ff195f763f8c7a4358ac98fa38a8de7/core/src/main/java/org/apache/iceberg/TableMetadataParser.java#L141) // - 00001-409702ba-4735-4645-8f14-09537cc0b2c8.metadata.json.gz (https://github.com/apache/iceberg/blob/ab398a0d5ff195f763f8c7a4358ac98fa38a8de7/core/src/main/java/org/apache/iceberg/TableMetadataParser.java#L146) private static final Pattern METADATA_FILE_NAME_PATTERN = Pattern.compile("(?\\d+)-(?[-a-fA-F0-9]*)(?\\.[a-zA-Z0-9]+)?" + Pattern.quote(METADATA_FILE_EXTENSION) + "(?\\.[a-zA-Z0-9]+)?"); + // Hadoop Generated Metadata file name examples + // - v0.metadata.json + // - v0.gz.metadata.json + // - v0.metadata.json.gz + private static final Pattern HADOOP_GENERATED_METADATA_FILE_NAME_PATTERN = Pattern.compile("v(?\\d+)(?\\.[a-zA-Z0-9]+)?" + Pattern.quote(METADATA_FILE_EXTENSION) + "(?\\.[a-zA-Z0-9]+)?"); private IcebergUtil() {} @@ -716,6 +721,10 @@ public static int parseVersion(String metadataFileName) if (matcher.matches()) { return parseInt(matcher.group("version")); } + matcher = HADOOP_GENERATED_METADATA_FILE_NAME_PATTERN.matcher(metadataFileName); + if (matcher.matches()) { + return parseInt(matcher.group("version")); + } throw new TrinoException(ICEBERG_BAD_DATA, "Invalid metadata file name: " + metadataFileName); } diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergRegisterTableProcedure.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergRegisterTableProcedure.java index 19283667c0057..701480917dbfe 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergRegisterTableProcedure.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergRegisterTableProcedure.java @@ -13,6 +13,7 @@ */ package io.trino.plugin.iceberg; +import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import io.trino.filesystem.FileEntry; import io.trino.filesystem.FileIterator; @@ -22,6 +23,15 @@ import io.trino.testing.AbstractTestQueryFramework; import io.trino.testing.MaterializedResult; import io.trino.testing.QueryRunner; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DataFiles; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.Table; +import org.apache.iceberg.hadoop.HadoopTables; +import org.apache.iceberg.types.Types; import org.testng.annotations.AfterClass; import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; @@ -38,6 +48,7 @@ import static com.google.common.base.Verify.verify; import static com.google.common.io.MoreFiles.deleteRecursively; import static com.google.common.io.RecursiveDeleteOption.ALLOW_INSECURE; +import static io.trino.hadoop.ConfigurationInstantiator.newEmptyConfiguration; import static io.trino.plugin.hive.metastore.file.TestingFileHiveMetastore.createTestingFileHiveMetastore; import static io.trino.plugin.iceberg.IcebergTestUtils.getFileSystemFactory; import static io.trino.plugin.iceberg.IcebergUtil.METADATA_FOLDER_NAME; @@ -46,6 +57,7 @@ import static io.trino.testing.TestingNames.randomNameSuffix; import static java.lang.String.format; import static java.util.Locale.ENGLISH; +import static org.apache.iceberg.Files.localInput; import static org.assertj.core.api.Assertions.assertThat; public class TestIcebergRegisterTableProcedure @@ -425,6 +437,62 @@ public void testRegisterTableWithInvalidMetadataFileName() } } + @Test + public void testRegisterHadoopTableAndRead() + { + // create a temporary table to generate data file + String tempTableName = "temp_table_" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tempTableName + " (id INT, name VARCHAR) WITH (format = 'ORC')"); + assertUpdate("INSERT INTO " + tempTableName + " values(1, 'INDIA')", 1); + String dataFilePath = (String) computeScalar("SELECT \"$path\" FROM " + tempTableName); + + // create hadoop table + String hadoopTableName = "hadoop_table_" + randomNameSuffix(); + String hadoopTableLocation = metastoreDir.getPath() + "/" + hadoopTableName; + HadoopTables hadoopTables = new HadoopTables(newEmptyConfiguration()); + Schema schema = new Schema(ImmutableList.of( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "name", Types.StringType.get()))); + Table table = hadoopTables.create( + schema, + PartitionSpec.unpartitioned(), + SortOrder.unsorted(), + ImmutableMap.of("write.format.default", "ORC"), + hadoopTableLocation); + + // append data file to hadoop table + DataFile dataFile = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withFormat(FileFormat.ORC) + .withInputFile(localInput(new File(dataFilePath))) + .withPath(dataFilePath) + .withRecordCount(1) + .build(); + table.newFastAppend() + .appendFile(dataFile) + .commit(); + + // Hadoop style version number + assertThat(Location.of(getLatestMetadataLocation(fileSystem, hadoopTableLocation)).fileName()) + .isEqualTo("v2.metadata.json"); + + // Try registering hadoop table in Trino and read it + String registeredTableName = "registered_table_" + randomNameSuffix(); + assertUpdate("CALL system.register_table(CURRENT_SCHEMA, '%s', '%s')".formatted(registeredTableName, hadoopTableLocation)); + assertQuery("SELECT * FROM " + registeredTableName, "VALUES (1, 'INDIA')"); + + // Verify the table can be written to despite using non-standard metadata file name + assertUpdate("INSERT INTO " + registeredTableName + " VALUES (2, 'POLAND')", 1); + assertQuery("SELECT * FROM " + registeredTableName, "VALUES (1, 'INDIA'), (2, 'POLAND')"); + + // New metadata file is written using standard file name convention + assertThat(Location.of(getLatestMetadataLocation(fileSystem, hadoopTableLocation)).fileName()) + .matches("00003-.*\\.metadata\\.json"); + + assertUpdate("DROP TABLE " + registeredTableName); + assertUpdate("DROP TABLE " + tempTableName); + } + private String getTableLocation(String tableName) { Pattern locationPattern = Pattern.compile(".*location = '(.*?)'.*", Pattern.DOTALL); diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergUtil.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergUtil.java index b4997a2788d92..f0e2ac5358a95 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergUtil.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergUtil.java @@ -28,6 +28,11 @@ public void testParseVersion() assertEquals(parseVersion("99999-409702ba-4735-4645-8f14-09537cc0b2c8.metadata.json"), 99999); assertEquals(parseVersion("00010-409702ba-4735-4645-8f14-09537cc0b2c8.metadata.json"), 10); assertEquals(parseVersion("00011-409702ba-4735-4645-8f14-09537cc0b2c8.metadata.json"), 11); + assertEquals(parseVersion("v0.metadata.json"), 0); + assertEquals(parseVersion("v10.metadata.json"), 10); + assertEquals(parseVersion("v99999.metadata.json"), 99999); + assertEquals(parseVersion("v0.gz.metadata.json"), 0); + assertEquals(parseVersion("v0.metadata.json.gz"), 0); assertThatThrownBy(() -> parseVersion("hdfs://hadoop-master:9000/user/hive/warehouse/orders_5-581fad8517934af6be1857a903559d44/metadata/00000-409702ba-4735-4645-8f14-09537cc0b2c8.metadata.json")) .hasMessageMatching("Not a file name: .*"); @@ -37,10 +42,18 @@ public void testParseVersion() .hasMessageMatching("Invalid metadata file name:.*"); assertThatThrownBy(() -> parseVersion("00010_409702ba_4735_4645_8f14_09537cc0b2c8.metadata.json")) .hasMessageMatching("Invalid metadata file name:.*"); + assertThatThrownBy(() -> parseVersion("v10_metadata_json")) + .hasMessageMatching("Invalid metadata file name:.*"); + assertThatThrownBy(() -> parseVersion("v1..gz.metadata.json")) + .hasMessageMatching("Invalid metadata file name:.*"); + assertThatThrownBy(() -> parseVersion("v1.metadata.json.gz.")) + .hasMessageMatching("Invalid metadata file name:.*"); assertThatThrownBy(() -> parseVersion("00003_409702ba-4735-4645-8f14-09537cc0b2c8.metadata.json")) .hasMessageMatching("Invalid metadata file name:.*"); assertThatThrownBy(() -> parseVersion("-00010-409702ba-4735-4645-8f14-09537cc0b2c8.metadata.json")) .hasMessageMatching("Invalid metadata file name:.*"); + assertThatThrownBy(() -> parseVersion("v-10.metadata.json")) + .hasMessageMatching("Invalid metadata file name:.*"); } }