From 912ff882599af51db07c7ad68b4a8520233945d7 Mon Sep 17 00:00:00 2001
From: Rob Rudin
Date: Wed, 28 Aug 2024 16:24:36 -0400
Subject: [PATCH 01/14] Bumped to 1.1-SNAPSHOT

---
 build.gradle      | 8 ++++----
 gradle.properties | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/build.gradle b/build.gradle
index 98e49f05..7009e1ca 100644
--- a/build.gradle
+++ b/build.gradle
@@ -10,10 +10,10 @@ subprojects {

     repositories {
         mavenCentral()
-//        mavenLocal()
-//        maven {
-//            url "https://bed-artifactory.bedford.progress.com:443/artifactory/ml-maven-snapshots/"
-//        }
+        mavenLocal()
+        maven {
+            url "https://bed-artifactory.bedford.progress.com:443/artifactory/ml-maven-snapshots/"
+        }
     }

     test {
diff --git a/gradle.properties b/gradle.properties
index 462fa3a8..ec1090d2 100644
--- a/gradle.properties
+++ b/gradle.properties
@@ -1,4 +1,4 @@
-version=1.0.0
+version=1.1-SNAPSHOT

 # Define these on the command line to publish to OSSRH
 # See https://central.sonatype.org/publish/publish-gradle/#credentials for more information

From 2925e6d9959d7a97f2ab019ec084ccf5fe0ec7dc Mon Sep 17 00:00:00 2001
From: Rob Rudin
Date: Mon, 23 Sep 2024 08:14:11 -0400
Subject: [PATCH 02/14] MLE-17041 Added streaming option for importing and exporting files

This is largely just declaring `--streaming` and ensuring it's sent to the
connector correctly.

---
 flux-cli/build.gradle                         |  2 +-
 .../flux/api/GenericFilesExporter.java        |  2 +
 .../flux/api/GenericFilesImporter.java        |  2 +
 .../flux/impl/export/ExportFilesCommand.java  | 37 +++++++++++++++++--
 .../impl/importdata/ImportFilesCommand.java   | 27 +++++++++++++-
 .../flux/api/GenericFilesImporterTest.java    |  2 +
 .../impl/export/ExportFilesOptionsTest.java   | 21 +++++++++++
 .../importdata/ImportFilesOptionsTest.java    |  9 +++--
 .../flux/impl/importdata/ImportFilesTest.java |  5 ++-
 gradle.properties                             |  2 +-
 10 files changed, 98 insertions(+), 11 deletions(-)

diff --git a/flux-cli/build.gradle b/flux-cli/build.gradle
index c0075178..7a786f95 100644
--- a/flux-cli/build.gradle
+++ b/flux-cli/build.gradle
@@ -17,7 +17,7 @@ dependencies {
         // The rocksdbjni dependency weighs in at 50mb and so far does not appear necessary for our use of Spark.
         exclude module: "rocksdbjni"
     }
-    implementation "com.marklogic:marklogic-spark-connector:2.3.1"
+    implementation "com.marklogic:marklogic-spark-connector:2.3-SNAPSHOT"
     implementation "info.picocli:picocli:4.7.6"

     // Spark 3.4.3 depends on Hadoop 3.3.4, which depends on AWS SDK 1.12.262.
As of August 2024, all public releases of diff --git a/flux-cli/src/main/java/com/marklogic/flux/api/GenericFilesExporter.java b/flux-cli/src/main/java/com/marklogic/flux/api/GenericFilesExporter.java index 854dd1e1..e4cf0811 100644 --- a/flux-cli/src/main/java/com/marklogic/flux/api/GenericFilesExporter.java +++ b/flux-cli/src/main/java/com/marklogic/flux/api/GenericFilesExporter.java @@ -32,6 +32,8 @@ interface WriteGenericFilesOptions { GenericFilesExporter from(Consumer> consumer); + GenericFilesExporter streaming(); + GenericFilesExporter to(Consumer consumer); GenericFilesExporter to(String path); diff --git a/flux-cli/src/main/java/com/marklogic/flux/api/GenericFilesImporter.java b/flux-cli/src/main/java/com/marklogic/flux/api/GenericFilesImporter.java index b313f875..555b666d 100644 --- a/flux-cli/src/main/java/com/marklogic/flux/api/GenericFilesImporter.java +++ b/flux-cli/src/main/java/com/marklogic/flux/api/GenericFilesImporter.java @@ -30,5 +30,7 @@ interface WriteGenericDocumentsOptions extends WriteDocumentsOptions consumer); } diff --git a/flux-cli/src/main/java/com/marklogic/flux/impl/export/ExportFilesCommand.java b/flux-cli/src/main/java/com/marklogic/flux/impl/export/ExportFilesCommand.java index a0275042..bb9992e7 100644 --- a/flux-cli/src/main/java/com/marklogic/flux/impl/export/ExportFilesCommand.java +++ b/flux-cli/src/main/java/com/marklogic/flux/impl/export/ExportFilesCommand.java @@ -30,6 +30,13 @@ public class ExportFilesCommand extends AbstractCommand im @CommandLine.Mixin protected WriteGenericFilesParams writeParams = new WriteGenericFilesParams(); + @CommandLine.Option( + names = "--streaming", + description = "Causes documents to be read from MarkLogic and streamed to the file source. Intended for " + + "exporting large files that cannot be fully read into memory." + ) + private boolean streaming; + @Override protected void validateDuringApiUsage() { readParams.verifyAtLeastOneQueryOptionIsSet("export"); @@ -50,7 +57,7 @@ protected Dataset loadDataset(SparkSession session, DataFrameReader reader) } return reader.format(MARKLOGIC_CONNECTOR) .options(getConnectionParams().makeOptions()) - .options(readParams.makeOptions()) + .options(buildReadOptions()) .load(); } @@ -58,12 +65,30 @@ protected Dataset loadDataset(SparkSession session, DataFrameReader reader) protected void applyWriter(SparkSession session, DataFrameWriter writer) { writeParams.s3Params.addToHadoopConfiguration(session.sparkContext().hadoopConfiguration()); writer.format(MARKLOGIC_CONNECTOR) - .options(writeParams.get()) + .options(buildWriteOptions()) // The connector only supports "Append" in terms of how Spark defines it, but it will always overwrite files. .mode(SaveMode.Append) .save(writeParams.path); } + protected final Map buildReadOptions() { + Map options = readParams.makeOptions(); + if (this.streaming) { + options.put(Options.STREAM_FILES, "true"); + } + return options; + } + + protected final Map buildWriteOptions() { + Map options = writeParams.get(); + if (this.streaming) { + options.put(Options.STREAM_FILES, "true"); + // Need connection information so that the writer can retrieve documents from MarkLogic. 
+ options.putAll(getConnectionParams().makeOptions()); + } + return options; + } + public static class WriteGenericFilesParams implements Supplier>, WriteGenericFilesOptions { @CommandLine.Option(required = true, names = "--path", description = "Path expression for where files should be written.") @@ -89,7 +114,7 @@ public static class WriteGenericFilesParams implements Supplier get() { return OptionsUtil.makeOptions( Options.WRITE_FILES_COMPRESSION, compressionType != null ? compressionType.name() : null, - Options.WRITE_FILES_PRETTY_PRINT, prettyPrint ? "true": null, + Options.WRITE_FILES_PRETTY_PRINT, prettyPrint ? "true" : null, Options.WRITE_FILES_ENCODING, encoding ); } @@ -172,4 +197,10 @@ public GenericFilesExporter to(String path) { writeParams.path(path); return this; } + + @Override + public GenericFilesExporter streaming() { + this.streaming = true; + return this; + } } diff --git a/flux-cli/src/main/java/com/marklogic/flux/impl/importdata/ImportFilesCommand.java b/flux-cli/src/main/java/com/marklogic/flux/impl/importdata/ImportFilesCommand.java index 4655d033..e6bb064b 100644 --- a/flux-cli/src/main/java/com/marklogic/flux/impl/importdata/ImportFilesCommand.java +++ b/flux-cli/src/main/java/com/marklogic/flux/impl/importdata/ImportFilesCommand.java @@ -24,6 +24,20 @@ public class ImportFilesCommand extends AbstractImportFilesCommand consumer) { consumer.accept(readParams); @@ -69,6 +86,8 @@ public static class ReadGenericFilesParams extends ReadFilesParams makeOptions() { return OptionsUtil.addOptions(super.makeOptions(), Options.READ_NUM_PARTITIONS, OptionsUtil.intOption(partitions), Options.READ_FILES_COMPRESSION, compressionType != null ? compressionType.name() : null, - Options.READ_FILES_ENCODING, encoding + Options.READ_FILES_ENCODING, encoding, + Options.STREAM_FILES, streaming ? "true" : null ); } @@ -101,6 +121,8 @@ public static class WriteGenericDocumentsParams extends WriteDocumentParams makeOptions() { return OptionsUtil.addOptions(super.makeOptions(), - Options.WRITE_DOCUMENT_TYPE, documentType != null ? documentType.name() : null + Options.WRITE_DOCUMENT_TYPE, documentType != null ? documentType.name() : null, + Options.STREAM_FILES, streaming ? "true" : null ); } } diff --git a/flux-cli/src/test/java/com/marklogic/flux/api/GenericFilesImporterTest.java b/flux-cli/src/test/java/com/marklogic/flux/api/GenericFilesImporterTest.java index f553ee99..d85aa1c9 100644 --- a/flux-cli/src/test/java/com/marklogic/flux/api/GenericFilesImporterTest.java +++ b/flux-cli/src/test/java/com/marklogic/flux/api/GenericFilesImporterTest.java @@ -20,6 +20,8 @@ void test() { Flux.importGenericFiles() .connectionString(makeConnectionString()) .from(PATH) + // Including streaming just for smoke testing and manual inspection of log messages. 
+ .streaming() .to(options -> options .collectionsString("api-files,second-collection") .permissionsString(DEFAULT_PERMISSIONS)) diff --git a/flux-cli/src/test/java/com/marklogic/flux/impl/export/ExportFilesOptionsTest.java b/flux-cli/src/test/java/com/marklogic/flux/impl/export/ExportFilesOptionsTest.java index 46baf047..41b90d49 100644 --- a/flux-cli/src/test/java/com/marklogic/flux/impl/export/ExportFilesOptionsTest.java +++ b/flux-cli/src/test/java/com/marklogic/flux/impl/export/ExportFilesOptionsTest.java @@ -58,4 +58,25 @@ void encoding() { Map options = command.writeParams.get(); assertEquals("ISO-8859-1", options.get(Options.WRITE_FILES_ENCODING)); } + + @Test + void streaming() { + ExportFilesCommand command = (ExportFilesCommand) getCommand( + "export-files", + "--connection-string", "test:test@host:8000", + "--collections", "anything", + "--path", "anywhere", + "--streaming" + ); + + Map readOptions = command.buildReadOptions(); + assertEquals("true", readOptions.get(Options.STREAM_FILES)); + assertEquals("anything", readOptions.get(Options.READ_DOCUMENTS_COLLECTIONS)); + + Map writeOptions = command.buildWriteOptions(); + assertEquals("true", writeOptions.get(Options.STREAM_FILES)); + assertEquals("test:test@host:8000", writeOptions.get(Options.CLIENT_URI), + "The connection options must be present in the write options so that the writer can connect " + + "to MarkLogic and read documents."); + } } diff --git a/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportFilesOptionsTest.java b/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportFilesOptionsTest.java index 1f7558cc..d6b9bca1 100644 --- a/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportFilesOptionsTest.java +++ b/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportFilesOptionsTest.java @@ -34,7 +34,8 @@ void test() { "--uri-prefix", "/prefix", "--uri-replace", ".*value,''", "--uri-suffix", ".suffix", - "--uri-template", "/test/{value}.json" + "--uri-template", "/test/{value}.json", + "--streaming" ); assertOptions(command.getConnectionParams().makeOptions(), @@ -46,7 +47,8 @@ void test() { assertOptions(command.getReadParams().makeOptions(), Options.READ_NUM_PARTITIONS, "6", - Options.READ_FILES_ENCODING, "UTF-16" + Options.READ_FILES_ENCODING, "UTF-16", + Options.STREAM_FILES, "true" ); assertOptions(command.getWriteParams().makeOptions(), @@ -64,7 +66,8 @@ void test() { Options.WRITE_URI_PREFIX, "/prefix", Options.WRITE_URI_REPLACE, ".*value,''", Options.WRITE_URI_SUFFIX, ".suffix", - Options.WRITE_URI_TEMPLATE, "/test/{value}.json" + Options.WRITE_URI_TEMPLATE, "/test/{value}.json", + Options.STREAM_FILES, "true" ); } } diff --git a/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportFilesTest.java b/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportFilesTest.java index 3850fb49..58b2582f 100644 --- a/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportFilesTest.java +++ b/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportFilesTest.java @@ -36,7 +36,10 @@ void multiplePaths() { // Including these for manual verification of progress logging. "--batch-size", "1", - "--log-progress", "2" + "--log-progress", "2", + + // Including for smoke testing and manual verification of logging. 
+ "--streaming" ); verifyDocsWereWritten(uris.length, uris); diff --git a/gradle.properties b/gradle.properties index ec1090d2..c8a61b98 100644 --- a/gradle.properties +++ b/gradle.properties @@ -1,4 +1,4 @@ -version=1.1-SNAPSHOT +version=1.0.1-SNAPSHOT # Define these on the command line to publish to OSSRH # See https://central.sonatype.org/publish/publish-gradle/#credentials for more information From 5975b8ac0ef6ade64177fd83009dfd9f884b308a Mon Sep 17 00:00:00 2001 From: Rob Rudin Date: Mon, 23 Sep 2024 16:27:00 -0400 Subject: [PATCH 03/14] MLE-16859 Export commands now default to "append" --- docs/export/export-rows.md | 4 +++ .../export/WriteStructuredFilesParams.java | 2 +- .../ExportDelimitedFilesCommandTest.java | 25 +++++++++++++++++++ 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/docs/export/export-rows.md b/docs/export/export-rows.md index 05fbd6e2..3f869f8c 100644 --- a/docs/export/export-rows.md +++ b/docs/export/export-rows.md @@ -306,6 +306,10 @@ location where data already exists. This option supports the following values: For convenience, the above values are case-sensitive so that you can ignore casing when choosing a value. +As of the 1.0.1 release of Flux, `--mode` defaults to `Append` for commands that write to a filesystem. In the 1.0.0 +release, these commands defaulted to `Overwrite`. The `export-jdbc` command defaults to `ErrorIfExists` avoid altering +an existing table in any way. + For further information on each mode, please see [the Spark documentation](https://spark.apache.org/docs/latest/sql-data-sources-load-save-functions.html#save-modes). diff --git a/flux-cli/src/main/java/com/marklogic/flux/impl/export/WriteStructuredFilesParams.java b/flux-cli/src/main/java/com/marklogic/flux/impl/export/WriteStructuredFilesParams.java index 521cabb4..c9b2ae87 100644 --- a/flux-cli/src/main/java/com/marklogic/flux/impl/export/WriteStructuredFilesParams.java +++ b/flux-cli/src/main/java/com/marklogic/flux/impl/export/WriteStructuredFilesParams.java @@ -17,7 +17,7 @@ public abstract class WriteStructuredFilesParams ex description = "Specifies how data is written if the path already exists. " + "See %nhttps://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/SaveMode.html for more information. " + OptionsUtil.VALID_VALUES_DESCRIPTION) - private SaveMode saveMode = SaveMode.OVERWRITE; + private SaveMode saveMode = SaveMode.APPEND; protected WriteStructuredFilesParams() { // For Avro/Parquet/etc files, writing many rows to a single file is acceptable and expected. 
diff --git a/flux-cli/src/test/java/com/marklogic/flux/impl/export/ExportDelimitedFilesCommandTest.java b/flux-cli/src/test/java/com/marklogic/flux/impl/export/ExportDelimitedFilesCommandTest.java index f7b20595..fb0e7efe 100644 --- a/flux-cli/src/test/java/com/marklogic/flux/impl/export/ExportDelimitedFilesCommandTest.java +++ b/flux-cli/src/test/java/com/marklogic/flux/impl/export/ExportDelimitedFilesCommandTest.java @@ -31,6 +31,31 @@ void test(@TempDir Path tempDir) throws IOException { verifyDelimitedFile(tempDir); } + @Test + void exportTwice(@TempDir Path tempDir) { + run( + "export-delimited-files", + "--connection-string", makeConnectionString(), + "--partitions", "1", + "--query", "op.fromView('Medical', 'Authors', '')", + "--path", tempDir.toFile().getAbsolutePath(), + "--file-count", "1" + ); + + run( + "export-delimited-files", + "--connection-string", makeConnectionString(), + "--partitions", "1", + "--query", "op.fromView('Medical', 'Authors', '')", + "--path", tempDir.toFile().getAbsolutePath(), + "--file-count", "1" + ); + + File[] files = tempDir.toFile().listFiles((dir, name) -> name.endsWith(".csv")); + assertEquals(2, files.length, "Per MLE-16859, commands that use Spark file data sources for exporting data " + + "should default to 'append' instead of 'overwrite' to avoid accidentally deleting data."); + } + /** * Verifies that an options file can have values with whitespace in them, which picocli supports and * JCommander oddly does not. Users will frequently have spaces in any Optic or search queries that they put into From a694e4457247a016ff240a24895fb9499851bb9b Mon Sep 17 00:00:00 2001 From: Rob Rudin Date: Tue, 24 Sep 2024 10:34:18 -0400 Subject: [PATCH 04/14] Added docs for streaming Plus a couple cute little "since" annotations. --- docs/export/export-documents.md | 20 ++++++++++++++++++ docs/import/import-files/generic-files.md | 21 +++++++++++++++++++ .../flux/api/GenericFilesExporter.java | 3 +++ .../flux/api/GenericFilesImporter.java | 3 +++ 4 files changed, 47 insertions(+) diff --git a/docs/export/export-documents.md b/docs/export/export-documents.md index b7614772..6efde723 100644 --- a/docs/export/export-documents.md +++ b/docs/export/export-documents.md @@ -133,6 +133,26 @@ bin\flux export-files ^ {% endtabs %} +## Exporting large binary files + +MarkLogic's [support for large binary documents](https://docs.marklogic.com/guide/app-dev/binaries#id_93203) allows +for storing binary files of any size. To ensure that large binary files can be exported to a file path, consider +using the `--streaming` option introduced in Flux 1.0.1. When this option is set, Flux will stream each document +from MarkLogic directly to the file path, thereby avoiding reading the contents of a file into memory. + +As streaming to a file requires Flux to retrieve one document at a time from MarkLogic, you should not use this option +when exporting smaller documents that can easily fit into the memory available to Flux. + +When using `--streaming`, the following options will behave in a different fashion: + +- `--batch-size` will still affect how many URIs are retrieved from MarkLogic in a single request, but will not impact +the number of documents retrieved from MarkLogic in a single request, which will always be 1. +- `--encoding` will be ignored as applying an encoding requires reading the document into memory. +- `--pretty-print` will have no effect as the contents of a document will never be read into memory. 
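+
+As a rough sketch of what this can look like on the command line, the following invocation simply adds `--streaming`
+to a normal `export-files` command. The connection string, collection name, and path below are placeholders to adapt
+to your own environment:
+
+```
+./bin/flux export-files \
+    --connection-string "flux-user:password@localhost:8004" \
+    --collections binaries \
+    --path destination \
+    --streaming
+```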
+ +You typically will also not want to use the `--transform` option as applying a REST transform in MarkLogic to a +very large binary may exhaust the amount of memory available to MarkLogic. + ## Understanding partitions As Flux is built on top of Apache Spark, it is heavily influenced by how Spark diff --git a/docs/import/import-files/generic-files.md b/docs/import/import-files/generic-files.md index e0113ac9..d7e5538d 100644 --- a/docs/import/import-files/generic-files.md +++ b/docs/import/import-files/generic-files.md @@ -84,6 +84,27 @@ bin\flux import-files ^ {% endtabs %} +## Importing large binary files + +Flux can leverage MarkLogic's [support for large binary documents](https://docs.marklogic.com/guide/app-dev/binaries#id_93203) +by importing binary files of any size. To ensure that binary files of any size can be loaded, consider using the +`--streaming` option introduced in Flux 1.0.1. When this option is set, Flux will stream the contents of each file from +its source directly into MarkLogic, thereby avoiding reading the contents of a file into memory. + +As streaming a file requires Flux to only send one document at a time to MarkLogic, you should not use this option when +importing smaller files that easily fit into the memory available to Flux. + +When using `--streaming`, the following options will have no effect due to Flux not reading the file contents into +memory and always sending one file per request to MarkLogic: + +- `--batch-size` +- `--encoding` +- `--failed-documents-path` +- `--uri-template` + +You typically will also not want to use the `--transform` option as applying a REST transform in MarkLogic to a very +large binary document may exhaust the amount of memory available to MarkLogic. + ## Importing Gzip files To import Gzip files with each file being decompressed before written to MarkLogic, include the `--compression` option diff --git a/flux-cli/src/main/java/com/marklogic/flux/api/GenericFilesExporter.java b/flux-cli/src/main/java/com/marklogic/flux/api/GenericFilesExporter.java index e4cf0811..b9fdc84f 100644 --- a/flux-cli/src/main/java/com/marklogic/flux/api/GenericFilesExporter.java +++ b/flux-cli/src/main/java/com/marklogic/flux/api/GenericFilesExporter.java @@ -32,6 +32,9 @@ interface WriteGenericFilesOptions { GenericFilesExporter from(Consumer> consumer); + /** + * @since 1.0.1 + */ GenericFilesExporter streaming(); GenericFilesExporter to(Consumer consumer); diff --git a/flux-cli/src/main/java/com/marklogic/flux/api/GenericFilesImporter.java b/flux-cli/src/main/java/com/marklogic/flux/api/GenericFilesImporter.java index 555b666d..ef35e6f6 100644 --- a/flux-cli/src/main/java/com/marklogic/flux/api/GenericFilesImporter.java +++ b/flux-cli/src/main/java/com/marklogic/flux/api/GenericFilesImporter.java @@ -30,6 +30,9 @@ interface WriteGenericDocumentsOptions extends WriteDocumentsOptions consumer); From 928b32c93ea217bbb50b92bd9c5952f6e4882925 Mon Sep 17 00:00:00 2001 From: Rob Rudin Date: Wed, 25 Sep 2024 13:13:47 -0400 Subject: [PATCH 05/14] MLE-17085 Added docs and test for serialized queries --- docs/copy.md | 4 +- docs/export/export-documents.md | 69 +++++++++++++++++++ .../flux/impl/export/ExportFilesTest.java | 15 ++++ .../options-files/cts-query-json.txt | 2 + 4 files changed, 88 insertions(+), 2 deletions(-) create mode 100644 flux-cli/src/test/resources/options-files/cts-query-json.txt diff --git a/docs/copy.md b/docs/copy.md index 402bacbb..7c2edfd8 100644 --- a/docs/copy.md +++ b/docs/copy.md @@ -32,8 +32,8 @@ The following options control 
which documents are read from MarkLogic:

 You must specify at least one of `--collections`, `--directory`, `--query`, `--string-query`, or `--uris`. You may
 specify any combination of those options as well, with the exception that `--query` will be ignored if `--uris` is
 specified.

-For examples of what the `--query` option support, please see
-[the MarkLogic search documentation](https://docs.marklogic.com/guide/rest-dev/search#id_49329).
+If you use `--query` to select documents, please see [the guide on exporting documents](export/export-documents.md) for
+more information and examples of the types of values you can provide to `--query`.

 The `copy` command then requires that you specify connection information for the output database that the documents
 will be copied into. Each of the [connection options](common-options.md) can be used for this output database, but with
diff --git a/docs/export/export-documents.md b/docs/export/export-documents.md
index 6efde723..510201cc 100644
--- a/docs/export/export-documents.md
+++ b/docs/export/export-documents.md
@@ -53,6 +53,75 @@ The following options control which documents are selected to be exported:

 You must specify at least one of `--collections`, `--directory`, `--query`, `--string-query`, or `--uris`. You may
 specify any combination of those options as well, with the exception that `--query` will be ignored if `--uris` is
 specified.

+## Specifying a query
+
+The `--query` option accepts any one of the following inputs:
+
+1. A [structured query](https://docs.marklogic.com/guide/search-dev/structured-query#).
+2. A [CTS query](https://docs.marklogic.com/guide/rest-dev/search#id_30577).
+3. A [combined query](https://docs.marklogic.com/guide/rest-dev/search#id_69918).
+
+The type of query you select can then be expressed in either JSON or XML. The documentation links above provide
+complete details on constructing each type of query, but for convenience, an example of each query is
+shown next.
+
+A structured query:
+
+```
+# JSON
+{"query": {"term-query": {"text": "hello"}}}
+
+# XML
+<query xmlns="http://marklogic.com/appservices/search"><term-query><text>hello</text></term-query></query>
+```
+
+A CTS query:
+
+```
+# JSON
+{"ctsquery": {"wordQuery": {"text": "hello"}}}
+
+# XML
+<cts:word-query xmlns:cts="http://marklogic.com/cts"><cts:text>hello</cts:text></cts:word-query>
+```
+
+A combined query, with options included:
+
+```
+# JSON
+{"search": {"options": {"constraint": {"name": "c1", "word": {"element": {"name": "text"}}}}, "qtext": "c1:hello"}}
+
+# XML
+<search xmlns="http://marklogic.com/appservices/search">
+  <options><constraint name="c1"><word><element name="text" ns=""/></word></constraint></options>
+  <qtext>c1:hello</qtext>
+</search>
+```
+
+### Specifying a query in an options file
+
+Serialized queries can be very lengthy, and thus it is often easier to put the `--query` option and its value in an
+[options file](../common-options.md).
+
+For queries expressed as JSON, you will need to ensure that the double quotes in your JSON are escaped correctly.
+For example:
+
+```
+--query
+"{\"ctsquery\": {\"wordQuery\": {\"text\": \"hello\"}}}"
+```
+
+As noted in the [options file guide](../common-options.md), you can use a newline symbol specific to the shell
+you use for running Flux to break the value into multiple lines:
+
+```
+--query
+"{\"ctsquery\": \
+{\"wordQuery\": {\"text\": \"hello\"}}}"
+```
+
+For queries expressed in XML, you may find it easier to use single quotes instead of double quotes, as single quotes
+do not require any escaping.
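+
+For instance, an options file entry for an XML combined query might look like the following sketch, where the query
+itself is purely illustrative:
+
+```
+--query
+"<search xmlns='http://marklogic.com/appservices/search'><qtext>hello</qtext></search>"
+```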
## Transforming document content

You can apply a [MarkLogic REST transform](https://docs.marklogic.com/guide/rest-dev/transforms)
diff --git a/flux-cli/src/test/java/com/marklogic/flux/impl/export/ExportFilesTest.java b/flux-cli/src/test/java/com/marklogic/flux/impl/export/ExportFilesTest.java
index 48b47570..d367b9a2 100644
--- a/flux-cli/src/test/java/com/marklogic/flux/impl/export/ExportFilesTest.java
+++ b/flux-cli/src/test/java/com/marklogic/flux/impl/export/ExportFilesTest.java
@@ -51,6 +51,21 @@ void exportToRegularFiles(@TempDir Path tempDir) throws Exception {
         }
     }

+    @Test
+    void exportViaQueryInOptionsFile(@TempDir Path tempDir) {
+        run(
+            "export-files",
+            "--path", tempDir.toFile().getAbsolutePath(),
+            "--connection-string", makeConnectionString(),
+            "@src/test/resources/options-files/cts-query-json.txt"
+        );
+
+        File dir = tempDir.toFile();
+        File authorsDir = new File(dir, "author");
+        assertEquals(1, authorsDir.listFiles().length, "Expecting the query to only retrieve the 'Vivianne' author.");
+    }
+
+
     @Test
     void exportViaUris(@TempDir Path tempDir) {
         run(
diff --git a/flux-cli/src/test/resources/options-files/cts-query-json.txt b/flux-cli/src/test/resources/options-files/cts-query-json.txt
new file mode 100644
index 00000000..fe31b59d
--- /dev/null
+++ b/flux-cli/src/test/resources/options-files/cts-query-json.txt
@@ -0,0 +1,2 @@
+--query
+"{\"ctsquery\": {\"wordQuery\": {\"text\": \"Vivianne\"}}}"

From 1766b9b6a97fa6947addeebe0eaeeea0303bb26d Mon Sep 17 00:00:00 2001
From: Rob Rudin
Date: Wed, 25 Sep 2024 14:33:15 -0400
Subject: [PATCH 06/14] Added note about URI encoding

---
 docs/import/import-files/generic-files.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/docs/import/import-files/generic-files.md b/docs/import/import-files/generic-files.md
index d7e5538d..215e61e6 100644
--- a/docs/import/import-files/generic-files.md
+++ b/docs/import/import-files/generic-files.md
@@ -105,6 +105,11 @@ memory and always sending one file per request to MarkLogic:

 You typically will also not want to use the `--transform` option as applying a REST transform in MarkLogic to a very
 large binary document may exhaust the amount of memory available to MarkLogic.

+In addition, when streaming documents to MarkLogic, URIs will be encoded. For example, a file named `my file.json`
+will result in a URI of `/my%20file.json`. This is due to an
+[issue in the MarkLogic REST API endpoint](https://docs.marklogic.com/REST/PUT/v1/documents) that will be resolved in
+a future server release.
+
 ## Importing Gzip files

 To import Gzip files with each file being decompressed before written to MarkLogic, include the `--compression` option

From 9e0f709645562e09b69afa8d813401b6d1d0485a Mon Sep 17 00:00:00 2001
From: Rob Rudin
Date: Thu, 26 Sep 2024 16:29:40 -0400
Subject: [PATCH 07/14] MLE-17095 Can now stream archives

Also gave flux-test-user the spark-user-role to ensure it can read/update
the files in the canned archive files.
---
 docs/export/export-documents.md               |  2 +-
 docs/export/export-rows.md                    |  2 +-
 docs/import/import-files/generic-files.md     |  2 +-
 .../flux/api/ArchiveFilesExporter.java        |  5 +++
 .../flux/api/ArchiveFilesImporter.java        |  5 +++
 .../flux/api/GenericFilesExporter.java        |  2 +-
 .../flux/api/GenericFilesImporter.java        |  2 +-
 .../export/ExportArchiveFilesCommand.java     | 40 ++++++++++++++++--
 .../importdata/ImportArchiveFilesCommand.java | 16 +++++++
 .../impl/importdata/ImportFilesCommand.java   | 14 ++----
 .../flux/impl/importdata/ReadFilesParams.java | 11 +++++
 .../impl/importdata/WriteDocumentParams.java  |  9 +++-
 .../ImportArchiveFilesOptionsTest.java        | 18 ++++++++
 .../importdata/ImportArchiveFilesTest.java    |  5 +--
 .../archive-files/invalid-archive.zip         | Bin 0 -> 843 bytes
 gradle.properties                             |  2 +-
 .../security/users/flux-test-user.json        |  3 +-
 17 files changed, 114 insertions(+), 24 deletions(-)
 create mode 100644 flux-cli/src/test/resources/archive-files/invalid-archive.zip

diff --git a/docs/export/export-documents.md b/docs/export/export-documents.md
index 510201cc..802f45a0 100644
--- a/docs/export/export-documents.md
+++ b/docs/export/export-documents.md
@@ -206,7 +206,7 @@
 MarkLogic's [support for large binary documents](https://docs.marklogic.com/guide/app-dev/binaries#id_93203) allows
 for storing binary files of any size. To ensure that large binary files can be exported to a file path, consider
-using the `--streaming` option introduced in Flux 1.0.1. When this option is set, Flux will stream each document
+using the `--streaming` option introduced in Flux 1.1.0. When this option is set, Flux will stream each document
 from MarkLogic directly to the file path, thereby avoiding reading the contents of a file into memory.

diff --git a/docs/export/export-rows.md b/docs/export/export-rows.md
index 936e4c02..1577016c 100644
--- a/docs/export/export-rows.md
+++ b/docs/export/export-rows.md
@@ -311,7 +311,7 @@
 For convenience, the above values are case-insensitive so that you can ignore casing when choosing a value.

-As of the 1.0.1 release of Flux, `--mode` defaults to `Append` for commands that write to a filesystem. In the 1.0.0
+As of the 1.1.0 release of Flux, `--mode` defaults to `Append` for commands that write to a filesystem. In the 1.0.0
 release, these commands defaulted to `Overwrite`. The `export-jdbc` command defaults to `ErrorIfExists` to avoid
 altering an existing table in any way.

diff --git a/docs/import/import-files/generic-files.md b/docs/import/import-files/generic-files.md
index 215e61e6..a2235918 100644
--- a/docs/import/import-files/generic-files.md
+++ b/docs/import/import-files/generic-files.md
@@ -88,7 +88,7 @@
 Flux can leverage MarkLogic's [support for large binary documents](https://docs.marklogic.com/guide/app-dev/binaries#id_93203)
 by importing binary files of any size. To ensure that binary files of any size can be loaded, consider using the
-`--streaming` option introduced in Flux 1.0.1. When this option is set, Flux will stream the contents of each file from
+`--streaming` option introduced in Flux 1.1.0. When this option is set, Flux will stream the contents of each file from
 its source directly into MarkLogic, thereby avoiding reading the contents of a file into memory.
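For example, a minimal sketch of a streaming import, where the path, connection string, and permission roles are all
placeholders:

```
./bin/flux import-files \
    --path /data/large-binaries \
    --connection-string "flux-user:password@localhost:8004" \
    --permissions rest-reader,read,rest-writer,update \
    --streaming
```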
As streaming a file requires Flux to only send one document at a time to MarkLogic, you should not use this option when diff --git a/flux-cli/src/main/java/com/marklogic/flux/api/ArchiveFilesExporter.java b/flux-cli/src/main/java/com/marklogic/flux/api/ArchiveFilesExporter.java index f58417fd..222e7e64 100644 --- a/flux-cli/src/main/java/com/marklogic/flux/api/ArchiveFilesExporter.java +++ b/flux-cli/src/main/java/com/marklogic/flux/api/ArchiveFilesExporter.java @@ -20,6 +20,11 @@ interface WriteArchiveFilesOptions extends WriteFilesOptions consumer); + /** + * @since 1.1.0 + */ + ArchiveFilesExporter streaming(); + ArchiveFilesExporter to(Consumer consumer); ArchiveFilesExporter to(String path); diff --git a/flux-cli/src/main/java/com/marklogic/flux/api/ArchiveFilesImporter.java b/flux-cli/src/main/java/com/marklogic/flux/api/ArchiveFilesImporter.java index b2ad311d..ae635ed9 100644 --- a/flux-cli/src/main/java/com/marklogic/flux/api/ArchiveFilesImporter.java +++ b/flux-cli/src/main/java/com/marklogic/flux/api/ArchiveFilesImporter.java @@ -22,5 +22,10 @@ interface ReadArchiveFilesOptions extends ReadFilesOptions> consumer); } diff --git a/flux-cli/src/main/java/com/marklogic/flux/api/GenericFilesExporter.java b/flux-cli/src/main/java/com/marklogic/flux/api/GenericFilesExporter.java index b9fdc84f..28bd75b4 100644 --- a/flux-cli/src/main/java/com/marklogic/flux/api/GenericFilesExporter.java +++ b/flux-cli/src/main/java/com/marklogic/flux/api/GenericFilesExporter.java @@ -33,7 +33,7 @@ interface WriteGenericFilesOptions { GenericFilesExporter from(Consumer> consumer); /** - * @since 1.0.1 + * @since 1.1.0 */ GenericFilesExporter streaming(); diff --git a/flux-cli/src/main/java/com/marklogic/flux/api/GenericFilesImporter.java b/flux-cli/src/main/java/com/marklogic/flux/api/GenericFilesImporter.java index ef35e6f6..78ef7740 100644 --- a/flux-cli/src/main/java/com/marklogic/flux/api/GenericFilesImporter.java +++ b/flux-cli/src/main/java/com/marklogic/flux/api/GenericFilesImporter.java @@ -31,7 +31,7 @@ interface WriteGenericDocumentsOptions extends WriteDocumentsOptions loadDataset(SparkSession session, DataFrameReader reader) } return reader.format(MARKLOGIC_CONNECTOR) .options(getConnectionParams().makeOptions()) - .options(readParams.makeOptions()) + .options(makeReadOptions()) .load(); } @@ -55,12 +62,33 @@ protected Dataset loadDataset(SparkSession session, DataFrameReader reader) protected void applyWriter(SparkSession session, DataFrameWriter writer) { writeParams.getS3Params().addToHadoopConfiguration(session.sparkContext().hadoopConfiguration()); writer.format(MARKLOGIC_CONNECTOR) - .options(writeParams.get()) - // The connector only supports "Append" in terms of how Spark defines it, but it will always overwrite files. + .options(makeWriteOptions()) .mode(SaveMode.Append) .save(writeParams.getPath()); } + // Extracted for unit-testing. + protected final Map makeReadOptions() { + Map readOptions = readParams.makeOptions(); + if (streaming) { + readOptions.put(Options.STREAM_FILES, "true"); + } + return readOptions; + } + + // Extracted for unit-testing. + protected Map makeWriteOptions() { + Map writeOptions = writeParams.get(); + if (streaming) { + writeOptions.put(Options.STREAM_FILES, "true"); + // The writer needs to know what metadata to retrieve when streaming. + writeOptions.put(Options.READ_DOCUMENTS_CATEGORIES, readParams.determineCategories()); + } + // Need connection params so writer can read documents and metadata from MarkLogic. 
+ writeOptions.putAll(getConnectionParams().makeOptions()); + return writeOptions; + } + public static class WriteArchiveFilesParams extends WriteFilesParams implements WriteArchiveFilesOptions { @CommandLine.Option(names = "--encoding", description = "Specify an encoding for writing files.") @@ -120,6 +148,12 @@ public ArchiveFilesExporter from(Consumer consumer) return this; } + @Override + public ArchiveFilesExporter streaming() { + this.streaming = true; + return this; + } + @Override public ArchiveFilesExporter to(Consumer consumer) { consumer.accept(writeParams); diff --git a/flux-cli/src/main/java/com/marklogic/flux/impl/importdata/ImportArchiveFilesCommand.java b/flux-cli/src/main/java/com/marklogic/flux/impl/importdata/ImportArchiveFilesCommand.java index 25ed0510..c945aa45 100644 --- a/flux-cli/src/main/java/com/marklogic/flux/impl/importdata/ImportArchiveFilesCommand.java +++ b/flux-cli/src/main/java/com/marklogic/flux/impl/importdata/ImportArchiveFilesCommand.java @@ -27,13 +27,23 @@ public class ImportArchiveFilesCommand extends AbstractImportFilesCommand> consumer) { consumer.accept(writeParams); diff --git a/flux-cli/src/main/java/com/marklogic/flux/impl/importdata/ImportFilesCommand.java b/flux-cli/src/main/java/com/marklogic/flux/impl/importdata/ImportFilesCommand.java index e6bb064b..0441ea9e 100644 --- a/flux-cli/src/main/java/com/marklogic/flux/impl/importdata/ImportFilesCommand.java +++ b/flux-cli/src/main/java/com/marklogic/flux/impl/importdata/ImportFilesCommand.java @@ -45,13 +45,13 @@ protected String getReadFormat() { @Override protected ReadFilesParams getReadParams() { - readParams.streaming = this.streaming; + readParams.setStreaming(this.streaming); return readParams; } @Override protected WriteDocumentParams getWriteParams() { - writeParams.streaming = this.streaming; + writeParams.setStreaming(this.streaming); return writeParams; } @@ -86,8 +86,6 @@ public static class ReadGenericFilesParams extends ReadFilesParams makeOptions() { return OptionsUtil.addOptions(super.makeOptions(), Options.READ_NUM_PARTITIONS, OptionsUtil.intOption(partitions), Options.READ_FILES_COMPRESSION, compressionType != null ? compressionType.name() : null, - Options.READ_FILES_ENCODING, encoding, - Options.STREAM_FILES, streaming ? "true" : null + Options.READ_FILES_ENCODING, encoding ); } @@ -121,8 +118,6 @@ public static class WriteGenericDocumentsParams extends WriteDocumentParams makeOptions() { return OptionsUtil.addOptions(super.makeOptions(), - Options.WRITE_DOCUMENT_TYPE, documentType != null ? documentType.name() : null, - Options.STREAM_FILES, streaming ? "true" : null + Options.WRITE_DOCUMENT_TYPE, documentType != null ? 
documentType.name() : null ); } } diff --git a/flux-cli/src/main/java/com/marklogic/flux/impl/importdata/ReadFilesParams.java b/flux-cli/src/main/java/com/marklogic/flux/impl/importdata/ReadFilesParams.java index 3a982480..af0542b7 100644 --- a/flux-cli/src/main/java/com/marklogic/flux/impl/importdata/ReadFilesParams.java +++ b/flux-cli/src/main/java/com/marklogic/flux/impl/importdata/ReadFilesParams.java @@ -39,6 +39,8 @@ public class ReadFilesParams implements ReadFilesOpt @CommandLine.Mixin private S3Params s3Params = new S3Params(); + private boolean streaming; + public boolean hasAtLeastOnePath() { return path != null && !path.isEmpty(); } @@ -57,6 +59,11 @@ public Map makeOptions() { if (recursiveFileLookup) { options.put("recursiveFileLookup", "true"); } + + if (streaming) { + options.put(Options.STREAM_FILES, "true"); + } + return options; } @@ -143,4 +150,8 @@ public T s3Endpoint(String endpoint) { this.s3Params.setEndpoint(endpoint); return (T) this; } + + public void setStreaming(boolean streaming) { + this.streaming = streaming; + } } diff --git a/flux-cli/src/main/java/com/marklogic/flux/impl/importdata/WriteDocumentParams.java b/flux-cli/src/main/java/com/marklogic/flux/impl/importdata/WriteDocumentParams.java index b6c1d6c0..86e1be28 100644 --- a/flux-cli/src/main/java/com/marklogic/flux/impl/importdata/WriteDocumentParams.java +++ b/flux-cli/src/main/java/com/marklogic/flux/impl/importdata/WriteDocumentParams.java @@ -120,6 +120,8 @@ public class WriteDocumentParams implements Wri ) private String uriTemplate; + private boolean streaming; + public Map makeOptions() { return OptionsUtil.makeOptions( Options.WRITE_ABORT_ON_FAILURE, abortOnWriteFailure ? "true" : "false", @@ -137,7 +139,8 @@ public Map makeOptions() { Options.WRITE_URI_PREFIX, uriPrefix, Options.WRITE_URI_REPLACE, uriReplace, Options.WRITE_URI_SUFFIX, uriSuffix, - Options.WRITE_URI_TEMPLATE, uriTemplate + Options.WRITE_URI_TEMPLATE, uriTemplate, + Options.STREAM_FILES, streaming ? 
"true" : null ); } @@ -247,4 +250,8 @@ public T uriTemplate(String uriTemplate) { this.uriTemplate = uriTemplate; return (T) this; } + + public void setStreaming(boolean streaming) { + this.streaming = streaming; + } } diff --git a/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportArchiveFilesOptionsTest.java b/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportArchiveFilesOptionsTest.java index c3a33901..0dcee34a 100644 --- a/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportArchiveFilesOptionsTest.java +++ b/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportArchiveFilesOptionsTest.java @@ -24,4 +24,22 @@ void test() { Options.READ_FILES_ENCODING, "UTF-16" ); } + + @Test + void streaming() { + ImportArchiveFilesCommand command = (ImportArchiveFilesCommand) getCommand( + "import-archive-files", + "--path", "src/test/resources/archive-files", + "--streaming" + ); + + assertOptions(command.getReadParams().makeOptions(), + Options.STREAM_FILES, "true", + Options.READ_FILES_TYPE, "archive" + ); + + assertOptions(command.getWriteParams().makeOptions(), + Options.STREAM_FILES, "true" + ); + } } diff --git a/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportArchiveFilesTest.java b/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportArchiveFilesTest.java index f25b9eed..c1849d87 100644 --- a/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportArchiveFilesTest.java +++ b/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportArchiveFilesTest.java @@ -87,14 +87,13 @@ void dontAbortOnReadFailureByDefault() { void abortOnReadFailure() { String stderr = runAndReturnStderr(() -> run( "import-archive-files", - "--path", "src/test/resources/archive-files", - "--path", "src/test/resources/mlcp-archives", + "--path", "src/test/resources/archive-files/invalid-archive.zip", "--abort-on-read-failure", "--connection-string", makeConnectionString() )); assertTrue( - stderr.contains("Command failed, cause: Could not find metadata entry for entry /test/1.xml.metadata in file"), + stderr.contains("Command failed, cause: Could not find metadata entry for entry test/1.xml in file"), "Unexpected stderr: " + stderr ); } diff --git a/flux-cli/src/test/resources/archive-files/invalid-archive.zip b/flux-cli/src/test/resources/archive-files/invalid-archive.zip new file mode 100644 index 0000000000000000000000000000000000000000..62029ad1c4888bfd9b3c14d7054422b15ae1eb48 GIT binary patch literal 843 zcmWIWW@Zs#;Nak3m|fc#!GHv~7+4reQj1IU4fQH=bN-!TVA#BI{fV=I8s55E=gyqp z9BOpMI3UR5toJ$plV`lQhjQ^2Ufvq8Y5U_%Cq2%dJh@%_y#6WAjThSs7y`W6Idpc{ zuv-94w+7+>Ja#fb>@)(|NqhT*fvy4j5p17cZfZ$lN@7Xk+{p)vniT}v;*WlOFRFATCA56a$cSKs1q8s z%KFkmL9U-x8P&@tAx|vb=D*LE|H9L>WR2;xEh1spi-T>>JE=Z* zSSx+&jmKWQN6fdx?=}9`i8Flp;r+UhjnTbAY)T^Ee-~fxR@X1HEQ?*9aJ|px zWTBmU?1B67YIFIF9i?`%g}#a0$j0|#-i(j4EHd~4Hy{1V4@wILk-4X%85tM^SU_ok zkx7IBkphv^2`B}k0@U<~UoW!G5|nsG*N5yLP>Mi+CqO2a^bz3A$_7%z41`;ObORHJ F2LKf^Bq;y@ literal 0 HcmV?d00001 diff --git a/gradle.properties b/gradle.properties index c8a61b98..0b4dfaf7 100644 --- a/gradle.properties +++ b/gradle.properties @@ -1,4 +1,4 @@ -version=1.0.1-SNAPSHOT +version=1.1.0-SNAPSHOT # Define these on the command line to publish to OSSRH # See https://central.sonatype.org/publish/publish-gradle/#credentials for more information diff --git a/test-app/src/main/ml-config/security/users/flux-test-user.json b/test-app/src/main/ml-config/security/users/flux-test-user.json index c7eb26a6..45c6fe50 100644 --- 
a/test-app/src/main/ml-config/security/users/flux-test-user.json +++ b/test-app/src/main/ml-config/security/users/flux-test-user.json @@ -2,6 +2,7 @@ "user-name": "flux-test-user", "password": "password", "role": [ - "flux-test-role" + "flux-test-role", + "spark-user-role" ] } From 32d5143903362e1defa75b6d666b10a45df4adcb Mon Sep 17 00:00:00 2001 From: Rob Rudin Date: Sun, 29 Sep 2024 21:01:54 -0400 Subject: [PATCH 08/14] MLE-17095 Docs for streaming archives --- docs/export/export-archives.md | 19 +++++++++++++++++++ docs/export/export-documents.md | 8 ++++---- docs/import/import-files/archives.md | 27 +++++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 4 deletions(-) diff --git a/docs/export/export-archives.md b/docs/export/export-archives.md index cfa86f4d..00155441 100644 --- a/docs/export/export-archives.md +++ b/docs/export/export-archives.md @@ -121,3 +121,22 @@ bin\flux export-archives ^ The encoding will be used for both document and metadata entries in each archive zip file. + +## Exporting large binary files + +Similar to [exporting large binary documents as files](export-documents.md), you can include large binary documents +in archives by including the `--streaming` option introduced in Flux 1.1.0. When this option is set, Flux will stream +each document from MarkLogic directly to a zip file, thereby avoiding reading the contents of a file into memory. + +As streaming to an archive requires Flux to retrieve one document at a time from MarkLogic, you should not use this option +when exporting smaller documents that can easily fit into the memory available to Flux. + +When using `--streaming`, the following options will behave in a different fashion: + +- `--batch-size` will still affect how many URIs are retrieved from MarkLogic in a single request, but will not impact + the number of documents retrieved from MarkLogic in a single request, which will always be 1. +- `--encoding` will be ignored as applying an encoding requires reading the document into memory. +- `--pretty-print` will have no effect as the contents of a document will never be read into memory. + +You typically will not want to use the `--transform` option as applying a REST transform in MarkLogic to a +large binary document may exhaust the amount of memory available to MarkLogic. diff --git a/docs/export/export-documents.md b/docs/export/export-documents.md index 802f45a0..d044274b 100644 --- a/docs/export/export-documents.md +++ b/docs/export/export-documents.md @@ -202,10 +202,10 @@ bin\flux export-files ^ {% endtabs %} -## Exporting large binary files +## Exporting large binary documents MarkLogic's [support for large binary documents](https://docs.marklogic.com/guide/app-dev/binaries#id_93203) allows -for storing binary files of any size. To ensure that large binary files can be exported to a file path, consider +for storing binary files of any size. To ensure that large binary documents can be exported to a file path, consider using the `--streaming` option introduced in Flux 1.1.0. When this option is set, Flux will stream each document from MarkLogic directly to the file path, thereby avoiding reading the contents of a file into memory. @@ -219,8 +219,8 @@ the number of documents retrieved from MarkLogic in a single request, which will - `--encoding` will be ignored as applying an encoding requires reading the document into memory. - `--pretty-print` will have no effect as the contents of a document will never be read into memory. 
-You typically will also not want to use the `--transform` option as applying a REST transform in MarkLogic to a -very large binary may exhaust the amount of memory available to MarkLogic. +You typically will not want to use the `--transform` option as applying a REST transform in MarkLogic to a +large binary document may exhaust the amount of memory available to MarkLogic. ## Understanding partitions diff --git a/docs/import/import-files/archives.md b/docs/import/import-files/archives.md index 31a81752..3025ffee 100644 --- a/docs/import/import-files/archives.md +++ b/docs/import/import-files/archives.md @@ -115,3 +115,30 @@ bin\flux import-archive-files ^ ``` {% endtab %} {% endtabs %} + +## Importing large binary files in archives + +When [exporting archives](../../export/export-archives.md), you can use the `--streaming` option introduced in Flux +1.1.0 to ensure that large binary documents in MarkLogic can be streamed to an archive file. When importing archives +with large binary files, you should likewise use the `--streaming` option to ensure that each large binary can be read +into MarkLogic without exhausting the memory available to Flux or MarkLogic. + +As streaming each entry requires Flux to only send one document at a time to MarkLogic, you should not use this option when +importing smaller files that easily fit into the memory available to Flux. + +When using `--streaming`, the following options will have no effect due to Flux not reading the file contents into +memory and always sending one file per request to MarkLogic: + +- `--batch-size` +- `--encoding` +- `--failed-documents-path` +- `--uri-template` + +You typically will also not want to use the `--transform` option as applying a REST transform in MarkLogic to a very +large binary document may exhaust the amount of memory available to MarkLogic. + +In addition, when streaming documents to MarkLogic, URIs will be encoded. For example, an entry named `/my file.json` +will result in a URI of `/my%20file.json`. This is due to an +[issue in the MarkLogic REST API endpoint](https://docs.marklogic.com/REST/PUT/v1/documents) that will be resolved in +a future server release. + From 09cf5ab3c6a26169a65786b6cbf4c96d2a47a8a8 Mon Sep 17 00:00:00 2001 From: Rob Rudin Date: Mon, 30 Sep 2024 16:21:41 -0400 Subject: [PATCH 09/14] Upgrading Sonar Fixed some existing issues too --- .gitignore | 1 + CONTRIBUTING.md | 23 ++++++++------- docker-compose.yml | 2 +- flux-cli/build.gradle | 11 ++++++-- .../java/com/marklogic/flux/AbstractTest.java | 28 ++++++++++++++++--- .../flux/api/OrcFilesImporterTest.java | 1 - .../flux/impl/ErrorMessagesTest.java | 2 ++ .../flux/impl/importdata/ImportFilesTest.java | 8 +++--- .../impl/importdata/ImportOrcFilesTest.java | 4 +-- 9 files changed, 53 insertions(+), 27 deletions(-) diff --git a/.gitignore b/.gitignore index 87e26873..bbaaa0ab 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,4 @@ flux/conf flux/export export flux-version.properties +docker/sonarqube diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2291f6aa..3804eb0c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,10 +1,11 @@ To contribute to this project, complete these steps to setup a MarkLogic instance via Docker with a test application installed: -1. Clone this repository if you have not already. -2. From the root directory of the project, run `docker-compose up -d --build`. -3. Wait 10 to 20 seconds and verify that shows the MarkLogic admin screen before proceeding. -4. 
Run `./gradlew -i mlDeploy` to deploy this project's test application (note that Java 11 or Java 17 is required). +1. Ensure you have Java 11 or higher installed; you will need Java 17 if you wish to use the Sonarqube support described below. +2. Clone this repository if you have not already. +3. From the root directory of the project, run `docker-compose up -d --build`. +4. Wait 10 to 20 seconds and verify that shows the MarkLogic admin screen before proceeding. +5. Run `./gradlew -i mlDeploy` to deploy this project's test application. Some of the tests depend on the Postgres instance deployed via Docker. Follow these steps to load a sample dataset into it: @@ -57,10 +58,7 @@ publishing a local snapshot of our Spark connector. Then just run: ./gradlew clean test -You can run the tests using either Java 11 or Java 17. - -In Intellij, the tests will run with Java 11. In order to run the tests in Intellij using Java 17, -perform the following steps: +If you are running the tests in Intellij with Java 17, you will need to perform the following steps: 1. Go to Run -> Edit Configurations in the Intellij toolbar. 2. Click on "Edit configuration templates". @@ -81,7 +79,7 @@ delete that configuration first via the "Run -> Edit Configurations" panel. ## Generating code quality reports with SonarQube In order to use SonarQube, you must have used Docker to run this project's `docker-compose.yml` file, and you must -have the services in that file running. +have the services in that file running. You must also use Java 17 to run the `sonar` Gradle task. To configure the SonarQube service, perform the following steps: @@ -97,8 +95,8 @@ To configure the SonarQube service, perform the following steps: 10. Add `systemProp.sonar.token=your token pasted here` to `gradle-local.properties` in the root of your project, creating that file if it does not exist yet. -To run SonarQube, run the following Gradle tasks, which will run all the tests with code coverage and then generate -a quality report with SonarQube: +To run SonarQube, run the following Gradle tasks with Java 17 or higher, which will run all the tests with code +coverage and then generate a quality report with SonarQube: ./gradlew test sonar @@ -116,7 +114,8 @@ before, then SonarQube will show "New Code" by default. That's handy, as you can you've introduced on the feature branch you're working on. You can then click on "Overall Code" to see all issues. Note that if you only need results on code smells and vulnerabilities, you can repeatedly run `./gradlew sonar` -without having to re-run the tests. +without having to re-run the tests. If you get an error from Sonar about Java sources, you just need to compile the +Java code, so run `./gradlew compileTestJava sonar`. ## Testing the documentation locally diff --git a/docker-compose.yml b/docker-compose.yml index 30ff14dd..d06e000e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -55,7 +55,7 @@ services: # Copied from https://docs.sonarsource.com/sonarqube/latest/setup-and-upgrade/install-the-server/#example-docker-compose-configuration . 
sonarqube: - image: sonarqube:10.3.0-community + image: sonarqube:10.6.0-community depends_on: - postgres environment: diff --git a/flux-cli/build.gradle b/flux-cli/build.gradle index 7a786f95..94a22ecc 100644 --- a/flux-cli/build.gradle +++ b/flux-cli/build.gradle @@ -2,7 +2,7 @@ plugins { id 'net.saliman.properties' version '1.5.2' id "application" id "jacoco" - id "org.sonarqube" version "4.4.1.3373" + id "org.sonarqube" version "5.1.0.4882" id 'com.github.johnrengelman.shadow' version '8.1.1' id 'maven-publish' } @@ -62,7 +62,7 @@ dependencies { testImplementation "com.databricks:spark-xml_2.12:0.18.0" // For configuring two-way SSL in tests. - testImplementation("com.marklogic:ml-app-deployer:4.8.0") { + testImplementation("com.marklogic:ml-app-deployer:5.0.0") { // Excluding Jackson so that we use whatever Jackson version is required by Spark. exclude group: "com.fasterxml.jackson.core" exclude group: "com.fasterxml.jackson.dataformat" @@ -70,7 +70,12 @@ dependencies { exclude module: "marklogic-client-api" } - shadowDependencies "com.marklogic:marklogic-spark-connector:2.3.1" + // Using Apache HttpClient for connecting to the MarkLogic Manage API. + testImplementation 'org.apache.httpcomponents:httpclient:4.5.14' + // Forcing HttpClient to use this to address https://snyk.io/vuln/SNYK-JAVA-COMMONSCODEC-561518 . + testImplementation 'commons-codec:commons-codec:1.17.1' + + shadowDependencies "com.marklogic:marklogic-spark-connector:2.3-SNAPSHOT" shadowDependencies "info.picocli:picocli:4.7.6" } diff --git a/flux-cli/src/test/java/com/marklogic/flux/AbstractTest.java b/flux-cli/src/test/java/com/marklogic/flux/AbstractTest.java index 6f82e237..90fda13d 100644 --- a/flux-cli/src/test/java/com/marklogic/flux/AbstractTest.java +++ b/flux-cli/src/test/java/com/marklogic/flux/AbstractTest.java @@ -10,10 +10,15 @@ import com.marklogic.junit5.AbstractMarkLogicTest; import com.marklogic.mgmt.ManageClient; import com.marklogic.mgmt.ManageConfig; -import com.marklogic.rest.util.RestTemplateUtil; +import com.marklogic.rest.util.MgmtResponseErrorHandler; +import org.apache.http.auth.AuthScope; +import org.apache.http.auth.UsernamePasswordCredentials; +import org.apache.http.impl.client.BasicCredentialsProvider; +import org.apache.http.impl.client.HttpClientBuilder; import org.apache.spark.sql.SparkSession; import org.junit.jupiter.api.AfterEach; import org.springframework.core.io.ClassPathResource; +import org.springframework.http.client.HttpComponentsClientHttpRequestFactory; import org.springframework.web.client.RestTemplate; import java.io.ByteArrayOutputStream; @@ -151,17 +156,32 @@ protected final FluxException assertThrowsFluxException(Runnable r) { return assertThrows(FluxException.class, () -> r.run()); } + /** + * Constructs a ManageClient using the "old" Apache HttpClient approach. This avoids classpath issues due to the + * relocated OkHttp classes in the connector jar. This code was copied from ml-gradle before its 5.0.0 release; + * it was removed from the 5.0.0 release as it had been deprecated for a while. + * + * @return + */ protected final ManageClient newManageClient() { Properties props = loadTestProperties(); // Forcing usage of the deprecated Apache HttpClient to avoid classpath issues with the relocated OkHttp classes // in our connector jar. 
- ManageConfig manageConfig = new ManageConfig(props.getProperty("marklogic.client.host"), 8002, + ManageConfig config = new ManageConfig(props.getProperty("marklogic.client.host"), 8002, props.getProperty("marklogic.client.username"), props.getProperty("marklogic.client.password") ); - RestTemplate restTemplate = RestTemplateUtil.newRestTemplate(manageConfig, RestTemplateUtil.DEFAULT_CONFIGURERS); + + HttpClientBuilder httpClientBuilder = HttpClientBuilder.create(); + BasicCredentialsProvider provider = new BasicCredentialsProvider(); + provider.setCredentials(new AuthScope(config.getHost(), config.getPort(), AuthScope.ANY_REALM), + new UsernamePasswordCredentials(config.getUsername(), config.getPassword())); + httpClientBuilder.setDefaultCredentialsProvider(provider); + + RestTemplate restTemplate = new RestTemplate(new HttpComponentsClientHttpRequestFactory(httpClientBuilder.build())); + restTemplate.setErrorHandler(new MgmtResponseErrorHandler()); ManageClient manageClient = new ManageClient(restTemplate); - manageClient.setManageConfig(manageConfig); + manageClient.setManageConfig(config); return manageClient; } } diff --git a/flux-cli/src/test/java/com/marklogic/flux/api/OrcFilesImporterTest.java b/flux-cli/src/test/java/com/marklogic/flux/api/OrcFilesImporterTest.java index 0a605b7e..ac127ac8 100644 --- a/flux-cli/src/test/java/com/marklogic/flux/api/OrcFilesImporterTest.java +++ b/flux-cli/src/test/java/com/marklogic/flux/api/OrcFilesImporterTest.java @@ -9,7 +9,6 @@ import java.util.Map; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertThrows; class OrcFilesImporterTest extends AbstractTest { diff --git a/flux-cli/src/test/java/com/marklogic/flux/impl/ErrorMessagesTest.java b/flux-cli/src/test/java/com/marklogic/flux/impl/ErrorMessagesTest.java index 8414ddde..40430e82 100644 --- a/flux-cli/src/test/java/com/marklogic/flux/impl/ErrorMessagesTest.java +++ b/flux-cli/src/test/java/com/marklogic/flux/impl/ErrorMessagesTest.java @@ -12,6 +12,8 @@ class ErrorMessagesTest { + // Don't need Sonar to complain about WRITE_FILE_ROWS_DOCUMENT_TYPE being deprecated, we are aware of that. 
+    @SuppressWarnings("java:S5738")
     @Test
     void verifyEachKeyIsOverridden() {
         ResourceBundle bundle = ResourceBundle.getBundle("marklogic-spark-messages");
diff --git a/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportFilesTest.java b/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportFilesTest.java
index 58b2582f..d769701c 100644
--- a/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportFilesTest.java
+++ b/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportFilesTest.java
@@ -20,7 +20,7 @@

 class ImportFilesTest extends AbstractTest {

-    String[] uris = new String[]{"/hello.json", "/hello.txt", "/hello.xml", "/hello2.txt.gz"};
+    private static final String[] MIXED_FILES_URIS = new String[]{"/hello.json", "/hello.txt", "/hello.xml", "/hello2.txt.gz"};

     @Test
     void multiplePaths() {
@@ -42,7 +42,7 @@ void multiplePaths() {
             "--streaming"
         );

-        verifyDocsWereWritten(uris.length, uris);
+        verifyDocsWereWritten(MIXED_FILES_URIS.length, MIXED_FILES_URIS);
     }

     @Test
@@ -96,7 +96,7 @@ void withUsernameAndPasswordAndAuthType() {
             "--uri-replace", ".*/mixed-files,''"
         );

-        verifyDocsWereWritten(uris.length, uris);
+        verifyDocsWereWritten(MIXED_FILES_URIS.length, MIXED_FILES_URIS);
     }

     @Test
@@ -147,7 +147,7 @@ void fileOptions(@TempDir Path tempDir) throws IOException {
             "--collections", "files"
         );

-        verifyDocsWereWritten(uris.length, uris);
+        verifyDocsWereWritten(MIXED_FILES_URIS.length, MIXED_FILES_URIS);
     }

     @Test
diff --git a/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportOrcFilesTest.java b/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportOrcFilesTest.java
index 2b55c15a..59d1e9cd 100644
--- a/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportOrcFilesTest.java
+++ b/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportOrcFilesTest.java
@@ -70,8 +70,8 @@ void aggregate() {
             JsonNode doc = readJsonDocument("/orc-test/" + i + ".json");
             assertEquals(i, doc.get("CitationID").asInt());
             ArrayNode names = (ArrayNode) doc.get("names");
-            for (int j = 0; i < names.size(); i++) {
-                JsonNode name = names.get(i);
+            for (int j = 0; j < names.size(); j++) {
+                JsonNode name = names.get(j);
                 assertTrue(name.has("ForeName"));
                 assertTrue(name.has("LastName"));
             }

From f547d5598d9032222cec254aa62e0c6f1c1af1fb Mon Sep 17 00:00:00 2001
From: Rob Rudin
Date: Tue, 1 Oct 2024 14:01:33 -0400
Subject: [PATCH 10/14] MLE-17142 Added docs for gzip streaming

Also made the use of "ZIP" and "gzip" consistent in the docs and the tests.
And bumped the docker-compose file to use the official MarkLogic images.

Otherwise, no functional changes here, just docs.
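A side note on the ImportOrcFilesTest fix above: the old loop tested and advanced the outer index `i` instead of the inner `j`, so the `names` array was never walked correctly. A minimal, self-contained sketch of the corrected Jackson iteration, using a hypothetical document shaped like the aggregated ORC test data:

```
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;

public class NamesIterationSketch {

    public static void main(String[] args) throws Exception {
        // Hypothetical document shaped like the aggregated ORC test data.
        String json = "{\"CitationID\":1,\"names\":[" +
            "{\"ForeName\":\"Jane\",\"LastName\":\"Doe\"}," +
            "{\"ForeName\":\"John\",\"LastName\":\"Smith\"}]}";
        JsonNode doc = new ObjectMapper().readTree(json);
        ArrayNode names = (ArrayNode) doc.get("names");
        // The inner loop must test and advance its own index; reusing the
        // outer counter skips or repeats entries.
        for (int j = 0; j < names.size(); j++) {
            JsonNode name = names.get(j);
            System.out.println(name.get("ForeName").asText() + " " + name.get("LastName").asText());
        }
    }
}
```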
--- docker-compose.yml | 4 +-- docs/export/export-archives.md | 14 +++++------ docs/export/export-documents.md | 25 ++++++++++--------- docs/export/export-rdf.md | 4 +-- docs/getting-started.md | 4 +-- docs/import/common-import-features.md | 2 +- docs/import/import-files/aggregate-xml.md | 2 +- docs/import/import-files/avro.md | 2 +- docs/import/import-files/delimited-text.md | 2 +- docs/import/import-files/generic-files.md | 15 ++++++----- docs/import/import-files/json.md | 2 +- docs/import/import-files/orc.md | 2 +- docs/import/import-files/parquet.md | 2 +- docs/import/import-files/rdf.md | 2 +- .../flux/impl/export/ExportFilesCommand.java | 4 +-- .../flux/api/GenericFilesExporterTest.java | 2 +- .../flux/impl/export/ExportFilesTest.java | 6 ++--- .../ImportAggregateJsonFilesTest.java | 8 +++--- 18 files changed, 52 insertions(+), 50 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index d06e000e..a1c7a94e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,5 +1,3 @@ -version: '3.8' - name: flux services: @@ -17,7 +15,7 @@ services: - 8007:8007 marklogic: - image: "marklogicdb/marklogic-db:11.2.0-centos-1.1.2" + image: "progressofficial/marklogic-db:latest" platform: linux/amd64 environment: - MARKLOGIC_INIT=true diff --git a/docs/export/export-archives.md b/docs/export/export-archives.md index 00155441..37a3972b 100644 --- a/docs/export/export-archives.md +++ b/docs/export/export-archives.md @@ -57,21 +57,21 @@ combination of those options as well, with the exception that `--query` will be You must then use the `--path` option to specify a directory to write archive files to. -### Windows-specific issues with zip files +### Windows-specific issues with ZIP files -In the likely event that you have one or more URIs with a forward slash - `/` - in them, then creating a zip file +In the likely event that you have one or more URIs with a forward slash - `/` - in them, then creating a ZIP file with those URIs - which are used as the zip entry names - will produce confusing behavior on Windows. If you open the -zip file via Windows Explorer, Windows will erroneously think the zip file is empty. If you open the zip file using +ZIP file via Windows Explorer, Windows will erroneously think the file is empty. If you open the file using 7-Zip, you will see a top-level entry named `_` if one or more of your URIs begin with a forward slash. These are effectively issues that only occur when viewing the file within Windows and do not reflect the actual contents of the -zip file. The contents of the file are correct and if you were to import them with Flux via the `import-archive-files` +ZIP file. The contents of the file are correct and if you were to import them with Flux via the `import-archive-files` command, you will get the expected results. ## Controlling document metadata Each exported document will have all of its associated metadata - collections, permissions, quality, properties, and -metadata values - included in an XML document in the archive zip file. You can control which types of metadata are +metadata values - included in an XML document in the archive ZIP file. You can control which types of metadata are included with the `--categories` option. This option accepts a comma-delimited sequence of the following metadata types: - `collections` @@ -120,13 +120,13 @@ bin\flux export-archives ^ {% endtabs %} -The encoding will be used for both document and metadata entries in each archive zip file. 
+The encoding will be used for both document and metadata entries in each archive ZIP file. ## Exporting large binary files Similar to [exporting large binary documents as files](export-documents.md), you can include large binary documents in archives by including the `--streaming` option introduced in Flux 1.1.0. When this option is set, Flux will stream -each document from MarkLogic directly to a zip file, thereby avoiding reading the contents of a file into memory. +each document from MarkLogic directly to a ZIP file, thereby avoiding reading the contents of a file into memory. As streaming to an archive requires Flux to retrieve one document at a time from MarkLogic, you should not use this option when exporting smaller documents that can easily fit into the memory available to Flux. diff --git a/docs/export/export-documents.md b/docs/export/export-documents.md index d044274b..8cd165aa 100644 --- a/docs/export/export-documents.md +++ b/docs/export/export-documents.md @@ -157,22 +157,22 @@ To use the above transform, verify that your user has been granted the MarkLogic ## Compressing content -The `--compression` option is used to write files either to Gzip or ZIP files. +The `--compression` option is used to write files either to gzip or ZIP files. -To Gzip each file, include `--compression GZIP`. +To gzip each file, include `--compression GZIP`. -To write multiple files to one or more ZIP files, include `--compression ZIP`. A zip file will be created for each +To write multiple files to one or more ZIP files, include `--compression ZIP`. A ZIP file will be created for each partition that was created when reading data via Optic. You can include `--zip-file-count 1` to force all documents to be written to a single ZIP file. See the below section on "Understanding partitions" for more information. -### Windows-specific issues with zip files +### Windows-specific issues with ZIP files -In the likely event that you have one or more URIs with a forward slash - `/` - in them, then creating a zip file +In the likely event that you have one or more URIs with a forward slash - `/` - in them, then creating a ZIP file with those URIs - which are used as the zip entry names - will produce confusing behavior on Windows. If you open the -zip file via Windows Explorer, Windows will erroneously think the zip file is empty. If you open the zip file using +ZIP file via Windows Explorer, Windows will erroneously think the file is empty. If you open the file using 7-Zip, you will see a top-level entry named `_` if one or more of your URIs begin with a forward slash. These are effectively issues that only occur when viewing the file within Windows and do not reflect the actual contents of the -zip file. The contents of the file are correct and if you were to import them with Flux via the `import-files` +ZIP file. The contents of the file are correct and if you were to import them with Flux via the `import-files` command, you will get the expected results. ## Specifying an encoding @@ -207,7 +207,8 @@ bin\flux export-files ^ MarkLogic's [support for large binary documents](https://docs.marklogic.com/guide/app-dev/binaries#id_93203) allows for storing binary files of any size. To ensure that large binary documents can be exported to a file path, consider using the `--streaming` option introduced in Flux 1.1.0. When this option is set, Flux will stream each document -from MarkLogic directly to the file path, thereby avoiding reading the contents of a file into memory. 
+from MarkLogic directly to the file path, thereby avoiding reading the contents of a file into memory. This option +can be used when exporting documents to gzip or ZIP files as well via the `--compression zip` option. As streaming to a file requires Flux to retrieve one document at a time from MarkLogic, you should not use this option when exporting smaller documents that can easily fit into the memory available to Flux. @@ -257,9 +258,9 @@ bin\flux export-files ^ {% endtab %} {% endtabs %} -The `./export` directory will have 12 zip files in it. This count is due to how Flux reads data from MarkLogic, +The `./export` directory will have 12 ZIP files in it. This count is due to how Flux reads data from MarkLogic, which involves creating 4 partitions by default per forest in the MarkLogic database. The example application has 3 -forests in its content database, and thus 12 partitions are created, resulting in 12 separate zip files. +forests in its content database, and thus 12 partitions are created, resulting in 12 separate ZIP files. You can use the `--partitions-per-forest` option to control how many partitions - and thus workers - read documents from each forest in your database: @@ -292,7 +293,7 @@ bin\flux export-files ^ {% endtabs %} -This approach will produce 3 zip files - one per forest. +This approach will produce 3 ZIP files - one per forest. You can also use the `--repartition` option, available on every command, to force the number of partitions used when writing data, regardless of how many were used to read the data: @@ -323,7 +324,7 @@ bin\flux export-files ^ {% endtabs %} -This approach will produce a single zip file due to the use of a single partition when writing files. +This approach will produce a single ZIP file due to the use of a single partition when writing files. The `--zip-file-count` option is effectively an alias for `--repartition`. Both options produce the same outcome. `--zip-file-count` is included as a more intuitive option for the common case of configuring how many files should be written. diff --git a/docs/export/export-rdf.md b/docs/export/export-rdf.md index 32e23e87..ccf34915 100644 --- a/docs/export/export-rdf.md +++ b/docs/export/export-rdf.md @@ -86,6 +86,6 @@ For some use cases involving exporting triples with their graphs to files contai reference the graph that each triple belongs to in MarkLogic. You can use `--graph-override` to specify an alternative graph value that will then be associated with every triple that Flux writes to a file. -## GZIP compression +## gzip compression -To compress each file written by Flux using GZIP, simply include `--gzip` as an option. +To compress each file written by Flux using gzip, simply include `--gzip` as an option. diff --git a/docs/getting-started.md b/docs/getting-started.md index 02fb7f6b..d96caabf 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -26,7 +26,7 @@ application that can be deployed to your own instance of MarkLogic server. The a `marklogic-flux-getting-started-1.0.0.zip`. To use Flux with this example application, perform the following steps: 1. Extract the `marklogic-flux-getting-started-1.0.0.zip` file to any location on your local filesystem. -2. Run `cd marklogic-flux-getting-started-1.0.0` to change to the directory created by extracting the zip file. +2. Run `cd marklogic-flux-getting-started-1.0.0` to change to the directory created by extracting the ZIP file. 3. 
Create a file named `gradle-local.properties` and add `mlPassword=your MarkLogic admin user password` to it. 4. Examine the contents of the `gradle.properties` file to ensure that the value of `mlHost` points to your MarkLogic server and that the value of `mlRestPort` is a port available for a new MarkLogic app server to use. @@ -192,7 +192,7 @@ thus producing hierarchical documents with nested data structures. Flux supports several commands for exporting data from MarkLogic, either as documents or rows, to a variety of destinations. Commands that export documents support a variety of queries, while commands that export rows use the [MarkLogic Optic API](https://docs.marklogic.com/guide/app-dev/OpticAPI) -to select rows. The following shows an example of exporting the 1000 employee documents to a single zip file: +to select rows. The following shows an example of exporting the 1000 employee documents to a single ZIP file: {% tabs log %} {% tab log Unix %} diff --git a/docs/import/common-import-features.md b/docs/import/common-import-features.md index 326fc523..977a4393 100644 --- a/docs/import/common-import-features.md +++ b/docs/import/common-import-features.md @@ -45,7 +45,7 @@ reads from its associated data source. This option is supported for the followin - `import-aggregate-json-files` - `import-avro-files` - `import-delimited-files` -- `import-files`, but only for JSON files and JSON entries in zip files. +- `import-files`, but only for JSON files and JSON entries in ZIP files. - `import-jdbc` - `import-orc-files` - `import-parquet-files` diff --git a/docs/import/import-files/aggregate-xml.md b/docs/import/import-files/aggregate-xml.md index f72c66ff..ae3f504f 100644 --- a/docs/import/import-files/aggregate-xml.md +++ b/docs/import/import-files/aggregate-xml.md @@ -107,7 +107,7 @@ example, adding the following would result in URIs of `/person/1.xml` and `/pers ## Compressed XML files -Flux supports Gzip and ZIP aggregate XML files. Simply include the `--compression` option with a value of `GZIP` or +Flux supports gzip and ZIP aggregate XML files. Simply include the `--compression` option with a value of `GZIP` or `ZIP`. ## Specifying an encoding diff --git a/docs/import/import-files/avro.md b/docs/import/import-files/avro.md index 0ae7be6f..201e6990 100644 --- a/docs/import/import-files/avro.md +++ b/docs/import/import-files/avro.md @@ -117,7 +117,7 @@ The `import-avro-files` command supports aggregating related rows together to pr ## Reading compressed files -Flux will automatically read files compressed with GZIP when they have a filename ending in `.gz`; you do not need to +Flux will automatically read files compressed with gzip when they have a filename ending in `.gz`; you do not need to specify a compression option. As noted in the "Advanced options" section below, you can use `-Pcompression=` to explicitly specify a compression algorithm if Flux is not able to read your compressed files automatically. 
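The `.gz` filename convention described above can be illustrated with the JDK alone. The sketch below is not Flux or Spark internals - just a demonstration of suffix-based gzip detection, with the file path supplied on the command line as an assumption:

```
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.zip.GZIPInputStream;

public class GzipBySuffixSketch {

    // Opens a file for reading, transparently decompressing it when the
    // ".gz" suffix signals gzip compression.
    static BufferedReader open(Path file) throws IOException {
        InputStream in = Files.newInputStream(file);
        if (file.getFileName().toString().endsWith(".gz")) {
            in = new GZIPInputStream(in);
        }
        return new BufferedReader(new InputStreamReader(in));
    }

    public static void main(String[] args) throws IOException {
        try (BufferedReader reader = open(Path.of(args[0]))) {
            reader.lines().limit(5).forEach(System.out::println);
        }
    }
}
```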
diff --git a/docs/import/import-files/delimited-text.md b/docs/import/import-files/delimited-text.md index 31b50c53..bf6aaf78 100644 --- a/docs/import/import-files/delimited-text.md +++ b/docs/import/import-files/delimited-text.md @@ -166,7 +166,7 @@ The `import-delimited-files` command supports aggregating related rows together ## Reading compressed files -Flux will automatically read files compressed with GZIP when they have a filename ending in `.gz`; you do not need to +Flux will automatically read files compressed with gzip when they have a filename ending in `.gz`; you do not need to specify a compression option. As noted in the "Advanced options" section below, you can use `-Pcompression=` to explicitly specify a compression algorithm if Flux is not able to read your compressed files automatically. diff --git a/docs/import/import-files/generic-files.md b/docs/import/import-files/generic-files.md index a2235918..e1879bbb 100644 --- a/docs/import/import-files/generic-files.md +++ b/docs/import/import-files/generic-files.md @@ -110,15 +110,18 @@ will result in a URI of `/my%20file.json`. This is due to an [issue in the MarkLogic REST API endpoint](https://docs.marklogic.com/REST/PUT/v1/documents) that will be resolved in a future server release. -## Importing Gzip files +## Importing gzip files + +To import gzip files with each file being decompressed before written to MarkLogic, include the `--compression` option +with a value of `GZIP`. You can also import gzip files as-is - i.e. without decompressing them - by not including the +`--compression` option. The `--streaming` option introduced in Flux 1.1.0 can also be used for very large gzip files +that may not fit into the memory available to Flux or to MarkLogic. -To import Gzip files with each file being decompressed before written to MarkLogic, include the `--compression` option -with a value of `GZIP`. You can also import Gzip files as-is - i.e. without decompressing them - by not including the -`--compression` option. - ## Importing ZIP files To import each entry in a ZIP file as a separate document, include the `--compression` option with a value of `ZIP`. Each document will have an initial URI based on both the absolute path of the ZIP file and the name of the ZIP entry. You can also use the `--document-type` option as described above to force a document type for any entry that has a file -extension not recognized by MarkLogic. +extension not recognized by MarkLogic. The `--streaming` option introduced in Flux 1.1.0 can also be used for ZIP files +containing very large binary files that may not fit into the memory available to Flux or to MarkLogic. + diff --git a/docs/import/import-files/json.md b/docs/import/import-files/json.md index a118ae4c..da9618e1 100644 --- a/docs/import/import-files/json.md +++ b/docs/import/import-files/json.md @@ -128,7 +128,7 @@ bin\flux import-aggregate-json-files ^ ## Reading compressed files -Flux will automatically read files compressed with GZIP when they have a filename ending in `.gz`; you do not need to +Flux will automatically read files compressed with gzip when they have a filename ending in `.gz`; you do not need to specify a compression option. As noted in the "Advanced options" section below, you can use `-Pcompression=` to explicitly specify a compression algorithm if Flux is not able to read your compressed files automatically. 
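To make the ZIP-entry URI scheme in generic-files.md above concrete, here is a JDK-only sketch that prints the initial URI each entry would receive. The docs only say the URI is based on the archive's absolute path and the entry name, so the exact concatenation below is an assumption:

```
import java.io.IOException;
import java.nio.file.Path;
import java.util.Enumeration;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;

public class ZipEntryUriSketch {

    public static void main(String[] args) throws IOException {
        Path zip = Path.of(args[0]);
        try (ZipFile zipFile = new ZipFile(zip.toFile())) {
            Enumeration<? extends ZipEntry> entries = zipFile.entries();
            while (entries.hasMoreElements()) {
                ZipEntry entry = entries.nextElement();
                if (!entry.isDirectory()) {
                    // Assumed URI shape: absolute archive path plus entry name.
                    System.out.println(zip.toAbsolutePath() + "/" + entry.getName());
                }
            }
        }
    }
}
```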
diff --git a/docs/import/import-files/orc.md b/docs/import/import-files/orc.md index 5251fb7a..e0fe08a8 100644 --- a/docs/import/import-files/orc.md +++ b/docs/import/import-files/orc.md @@ -117,7 +117,7 @@ The `import-orc-files` command supports aggregating related rows together to pro ## Reading compressed files -Flux will automatically read files compressed with GZIP when they have a filename ending in `.gz`; you do not need to +Flux will automatically read files compressed with gzip when they have a filename ending in `.gz`; you do not need to specify a compression option. As noted in the "Advanced options" section below, you can use `-Pcompression=` to explicitly specify a compression algorithm if Flux is not able to read your compressed files automatically. diff --git a/docs/import/import-files/parquet.md b/docs/import/import-files/parquet.md index 622d4008..3396e25f 100644 --- a/docs/import/import-files/parquet.md +++ b/docs/import/import-files/parquet.md @@ -117,7 +117,7 @@ The `import-parquet-files` command supports aggregating related rows together to ## Reading compressed files -Flux will automatically read files compressed with GZIP when they have a filename ending in `.gz`; you do not need to +Flux will automatically read files compressed with gzip when they have a filename ending in `.gz`; you do not need to specify a compression option. As noted in the "Advanced options" section below, you can use `-Pcompression=` to explicitly specify a compression algorithm if Flux is not able to read your compressed files automatically. diff --git a/docs/import/import-files/rdf.md b/docs/import/import-files/rdf.md index ab2de6f6..8c2b084b 100644 --- a/docs/import/import-files/rdf.md +++ b/docs/import/import-files/rdf.md @@ -78,4 +78,4 @@ are free to specify as many collections as you want in addition to the graph you ## Compressed files -Flux supports Gzip and ZIP RDF files. Simply include the `--compression` option with a value of `GZIP` or `ZIP`. +Flux supports gzip and ZIP RDF files. Simply include the `--compression` option with a value of `GZIP` or `ZIP`. 
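Several of the import docs above hinge on a `--compression` option accepting `GZIP` or `ZIP`. For readers unfamiliar with picocli, the CLI library Flux depends on, this hypothetical trimmed-down command shows how such an enum-backed option is typically declared; Flux's real option classes are more involved:

```
import picocli.CommandLine;

import java.util.concurrent.Callable;

// Hypothetical, trimmed-down command illustrating an enum-backed
// --compression option; not Flux's actual option class.
@CommandLine.Command(name = "compression-sketch")
public class CompressionOptionSketch implements Callable<Integer> {

    enum CompressionType { GZIP, ZIP }

    @CommandLine.Option(
        names = "--compression",
        description = "Compression of the input files; valid values: ${COMPLETION-CANDIDATES}."
    )
    private CompressionType compression;

    @Override
    public Integer call() {
        System.out.println("Compression: " + compression);
        return 0;
    }

    public static void main(String[] args) {
        System.exit(new CommandLine(new CompressionOptionSketch()).execute(args));
    }
}
```

picocli converts the raw string to the enum automatically, and `${COMPLETION-CANDIDATES}` expands to the legal values in usage help.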
diff --git a/flux-cli/src/main/java/com/marklogic/flux/impl/export/ExportFilesCommand.java b/flux-cli/src/main/java/com/marklogic/flux/impl/export/ExportFilesCommand.java
index bb9992e7..fc978c84 100644
--- a/flux-cli/src/main/java/com/marklogic/flux/impl/export/ExportFilesCommand.java
+++ b/flux-cli/src/main/java/com/marklogic/flux/impl/export/ExportFilesCommand.java
@@ -97,8 +97,8 @@ public static class WriteGenericFilesParams implements Supplier rows = newSparkSession().read().format("marklogic")
             .option(Options.READ_FILES_COMPRESSION, "zip")
             .load(dir.getAbsolutePath())
@@ -128,7 +128,7 @@ void exportToZipsWithRepartition(@TempDir Path tempDir) {
         );

         File dir = tempDir.toFile();
-        assertEquals(5, dir.listFiles().length, "Should have 5 zip files instead of 3 due to the use of --zip-file-count.");
+        assertEquals(5, dir.listFiles().length, "Should have 5 ZIP files instead of 3 due to the use of --zip-file-count.");
     }

     @Test
diff --git a/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportAggregateJsonFilesTest.java b/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportAggregateJsonFilesTest.java
index dcbd0347..144de5c3 100644
--- a/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportAggregateJsonFilesTest.java
+++ b/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportAggregateJsonFilesTest.java
@@ -183,12 +183,12 @@ void jsonLinesWithCustomDelimiter() {
     }

     /**
-     * This documents that if a user has a zip file, and we would normally use a Spark data source to read the type files
-     * inside the zip, we unfortunately cannot do that as Spark doesn't have any support for zip files. Databricks
+     * This documents that if a user has a ZIP file, and we would normally use a Spark data source to read the type of files
+     * inside the ZIP, we unfortunately cannot do that as Spark doesn't have any support for ZIP files. Databricks
      * documentation - https://docs.databricks.com/en/files/unzip-files.html - confirms this, noting that if a user has
-     * a zip file, they should first expand it.
+     * a ZIP file, they should first expand it.
     *

- * So for zip files, the best we can do is use our own reader, which is limited to reading each file as a "file row" + * So for ZIP files, the best we can do is use our own reader, which is limited to reading each file as a "file row" * and then writing it as a document to MarkLogic. Which means that a user cannot use a feature like * "--uri-template", as that depends on having values in columns that can be referenced by the template. We will * hopefully be enhancing this in a future story - specifically, by enhancing the URI template feature to work on From c1bef803806a47d267bd83ea50635668e7e9ff55 Mon Sep 17 00:00:00 2001 From: Rob Rudin Date: Tue, 1 Oct 2024 15:07:18 -0400 Subject: [PATCH 11/14] Bumped connector to 2.4-SNAPSHOT --- flux-cli/build.gradle | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flux-cli/build.gradle b/flux-cli/build.gradle index 94a22ecc..095ff60b 100644 --- a/flux-cli/build.gradle +++ b/flux-cli/build.gradle @@ -17,7 +17,7 @@ dependencies { // The rocksdbjni dependency weighs in at 50mb and so far does not appear necessary for our use of Spark. exclude module: "rocksdbjni" } - implementation "com.marklogic:marklogic-spark-connector:2.3-SNAPSHOT" + implementation "com.marklogic:marklogic-spark-connector:2.4-SNAPSHOT" implementation "info.picocli:picocli:4.7.6" // Spark 3.4.3 depends on Hadoop 3.3.4, which depends on AWS SDK 1.12.262. As of August 2024, all public releases of @@ -75,7 +75,7 @@ dependencies { // Forcing HttpClient to use this to address https://snyk.io/vuln/SNYK-JAVA-COMMONSCODEC-561518 . testImplementation 'commons-codec:commons-codec:1.17.1' - shadowDependencies "com.marklogic:marklogic-spark-connector:2.3-SNAPSHOT" + shadowDependencies "com.marklogic:marklogic-spark-connector:2.4-SNAPSHOT" shadowDependencies "info.picocli:picocli:4.7.6" } From 1c69c0d30d741e7f822dc81710390696797a45b9 Mon Sep 17 00:00:00 2001 From: Rob Rudin Date: Tue, 1 Oct 2024 15:11:44 -0400 Subject: [PATCH 12/14] Tweak to getting-started zip --- build.gradle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.gradle b/build.gradle index 7009e1ca..e2ce7c34 100644 --- a/build.gradle +++ b/build.gradle @@ -29,7 +29,7 @@ task gettingStartedZip(type: Zip) { description = "Creates a zip of the getting-started project that is intended to be included as a downloadable file " + "on the GitHub release page." from "examples/getting-started" - exclude "build", ".gradle", "gradle-*.properties", "flux", ".gitignore" + exclude "build", ".gradle", "gradle-*.properties", "flux", ".gitignore", "marklogic-flux" into "marklogic-flux-getting-started-${version}" archiveFileName = "marklogic-flux-getting-started-${version}.zip" destinationDirectory = file("build") From b595e3721e664ffd3cc5707ec6fef6df9eb2b789 Mon Sep 17 00:00:00 2001 From: Rob Rudin Date: Wed, 2 Oct 2024 10:27:59 -0400 Subject: [PATCH 13/14] Bumped to Spark 3.5.3 All tests passing in both the connector project and Flux repo. Verified manually with spark-submit too. --- CONTRIBUTING.md | 6 +++--- flux-cli/build.gradle | 4 ++-- .../flux/impl/importdata/ImportDelimitedFilesTest.java | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3804eb0c..223b8613 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -228,7 +228,7 @@ Set `SPARK_HOME` to the location of Spark - e.g. 
`/Users/myname/.sdkman/candidat Next, start a Spark master node: - cd $SPARK_HOME/bin + cd $SPARK_HOME/sbin start-master.sh You will need the address at which the Spark master node can be reached. To find it, open the log file that Spark @@ -264,7 +264,7 @@ cluster: ``` $SPARK_HOME/bin/spark-submit --class com.marklogic.flux.spark.Submit \ ---master spark://NYWHYC3G0W:7077 flux-cli/build/libs/marklogic-flux-1.0.0-all.jar \ +--master spark://NYWHYC3G0W:7077 flux-cli/build/libs/marklogic-flux-1.1.0-SNAPSHOT-all.jar \ import-files --path /Users/rudin/workspace/flux/flux-cli/src/test/resources/mixed-files \ --connection-string "admin:admin@localhost:8000" \ --preview 5 --preview-drop content @@ -281,7 +281,7 @@ to something you can access): $SPARK_HOME/bin/spark-submit --class com.marklogic.flux.spark.Submit \ --packages org.apache.hadoop:hadoop-aws:3.3.4,org.apache.hadoop:hadoop-client:3.3.4 \ --master spark://NYWHYC3G0W:7077 \ -flux-cli/build/libs/marklogic-flux-1.0.0-all.jar \ +flux-cli/build/libs/marklogic-flux-1.1.0-SNAPSHOT-all.jar \ import-files --path "s3a://changeme/" \ --connection-string "admin:admin@localhost:8000" \ --s3-add-credentials \ diff --git a/flux-cli/build.gradle b/flux-cli/build.gradle index 095ff60b..27ae0d57 100644 --- a/flux-cli/build.gradle +++ b/flux-cli/build.gradle @@ -13,7 +13,7 @@ configurations { } dependencies { - implementation("org.apache.spark:spark-sql_2.12:3.4.3") { + implementation("org.apache.spark:spark-sql_2.12:3.5.3") { // The rocksdbjni dependency weighs in at 50mb and so far does not appear necessary for our use of Spark. exclude module: "rocksdbjni" } @@ -44,7 +44,7 @@ dependencies { implementation "org.apache.hadoop:hadoop-client:3.3.4" // Spark doesn't include Avro support by default, so need to bring this in. - implementation "org.apache.spark:spark-avro_2.12:3.4.3" + implementation "org.apache.spark:spark-avro_2.12:3.5.3" testImplementation("com.marklogic:marklogic-junit5:1.4.0") { // Excluding Jackson so that we use whatever Jackson version is required by Spark. diff --git a/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportDelimitedFilesTest.java b/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportDelimitedFilesTest.java index da251b04..fe217d59 100644 --- a/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportDelimitedFilesTest.java +++ b/flux-cli/src/test/java/com/marklogic/flux/impl/importdata/ImportDelimitedFilesTest.java @@ -241,9 +241,9 @@ void abortOnReadFailure() { )); assertCollectionSize("delimited-test", 0); - assertTrue(stderr.contains("Command failed, cause: [MALFORMED_RECORD_IN_PARSING]"), "The command should " + + assertTrue(stderr.contains("Command failed, cause: [MALFORMED_RECORD_IN_PARSING"), "The command should " + "have failed due to --abort-on-read-failure being included. This should result in the 'mode' option being " + - "set to FAILFAST."); + "set to FAILFAST. Actual stderr: " + stderr); } @Test From 6378608babf7f420e0a82660acf85ad9200cdb89 Mon Sep 17 00:00:00 2001 From: Rob Rudin Date: Wed, 2 Oct 2024 15:37:35 -0400 Subject: [PATCH 14/14] Bumped to 1.1.0 Bumped connector dependency to 2.4.0. Fixed a few issues in the NOTICE file. 
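One testing pattern from the Spark 3.5.3 bump above is worth calling out: the delimited-files test now matches only the stable error-code prefix and appends the captured stderr to the assertion message. A self-contained sketch of that pattern, with a hard-coded stand-in for the captured output:

```
import static org.junit.jupiter.api.Assertions.assertTrue;

import org.junit.jupiter.api.Test;

class StderrAssertionSketch {

    // Stand-in for output captured from a failed command run; in the real
    // tests this comes from capturing System.err.
    private String runAndCaptureStderr() {
        return "Command failed, cause: [MALFORMED_RECORD_IN_PARSING] Malformed records detected.";
    }

    @Test
    void failureMessageIncludesActualOutput() {
        String stderr = runAndCaptureStderr();
        // Matching the error-code prefix (without a closing bracket) tolerates
        // suffixes that vary across Spark versions, and appending the captured
        // stderr makes a failing assertion self-explanatory.
        assertTrue(stderr.contains("Command failed, cause: [MALFORMED_RECORD_IN_PARSING"),
            "Expected a FAILFAST parse failure. Actual stderr: " + stderr);
    }
}
```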
---
 CONTRIBUTING.md                      |  6 +++---
 NOTICE.txt                           | 24 ++++++++++++------------
 build.gradle                         |  8 ++++----
 docs/api.md                          |  8 ++++----
 docs/getting-started.md              | 16 ++++++++--------
 docs/spark-integration.md            | 10 +++++-----
 examples/client-project/build.gradle |  4 ++--
 flux-cli/build.gradle                |  4 ++--
 gradle.properties                    |  2 +-
 9 files changed, 41 insertions(+), 41 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 223b8613..7f3f75e1 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -256,7 +256,7 @@ are all synonyms):

     ./gradlew shadowJar

-This will produce an assembly jar at `./flux-cli/build/libs/marklogic-flux-1.0.0-all.jar`.
+This will produce an assembly jar at `./flux-cli/build/libs/marklogic-flux-1.1.0-all.jar`.

 You can now run any CLI command via spark-submit. This is an example of previewing an import of files - change the
 value of `--path`, as an absolute path is needed, and of course change the value of `--master` to match that of your Spark
@@ -264,7 +264,7 @@ cluster:

 ```
 $SPARK_HOME/bin/spark-submit --class com.marklogic.flux.spark.Submit \
---master spark://NYWHYC3G0W:7077 flux-cli/build/libs/marklogic-flux-1.1.0-SNAPSHOT-all.jar \
+--master spark://NYWHYC3G0W:7077 flux-cli/build/libs/marklogic-flux-1.1.0-all.jar \
 import-files --path /Users/rudin/workspace/flux/flux-cli/src/test/resources/mixed-files \
 --connection-string "admin:admin@localhost:8000" \
 --preview 5 --preview-drop content
@@ -281,7 +281,7 @@ to something you can access):
 $SPARK_HOME/bin/spark-submit --class com.marklogic.flux.spark.Submit \
 --packages org.apache.hadoop:hadoop-aws:3.3.4,org.apache.hadoop:hadoop-client:3.3.4 \
 --master spark://NYWHYC3G0W:7077 \
-flux-cli/build/libs/marklogic-flux-1.1.0-SNAPSHOT-all.jar \
+flux-cli/build/libs/marklogic-flux-1.1.0-all.jar \
 import-files --path "s3a://changeme/" \
 --connection-string "admin:admin@localhost:8000" \
 --s3-add-credentials \
diff --git a/NOTICE.txt b/NOTICE.txt
index 9d60916c..231e915a 100644
--- a/NOTICE.txt
+++ b/NOTICE.txt
@@ -6,13 +6,13 @@ To the extent required by the applicable open-source license, a complete machine

 Third Party Notices

-aws-java-sdk-s3 1.12.367 (Apache-2.0)
-hadoop-aws 3.3.6 (Apache-2.0)
-hadoop-client 3.3.6 (Apache-2.0)
-marklogic-spark-connector 2.3.0 (Apache-2.0)
+aws-java-sdk-s3 1.12.262 (Apache-2.0)
+hadoop-aws 3.3.4 (Apache-2.0)
+hadoop-client 3.3.4 (Apache-2.0)
+marklogic-spark-connector 2.4.0 (Apache-2.0)
 picocli 4.7.6 (Apache-2.0)
-spark-avro_2.12 3.4.3 (Apache-2.0)
-spark-sql_2.12 3.4.3 (Apache-2.0)
+spark-avro_2.12 3.5.3 (Apache-2.0)
+spark-sql_2.12 3.5.3 (Apache-2.0)

 Common Licenses

@@ -22,20 +22,20 @@ Third-Party Components

 The following is a list of the third-party components used by MarkLogic® Flux™ v1 (last updated July 2, 2024):

-aws-java-sdk-s3 1.12.367 (Apache-2.0)
+aws-java-sdk-s3 1.12.262 (Apache-2.0)
 https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-s3
 For the full text of the Apache-2.0 license, see Apache License 2.0 (Apache-2.0)

-hadoop-aws 3.3.6 (Apache-2.0)
+hadoop-aws 3.3.4 (Apache-2.0)
 https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws
 For the full text of the Apache-2.0 license, see Apache License 2.0 (Apache-2.0)

-hadoop-client 3.3.6 (Apache-2.0)
+hadoop-client 3.3.4 (Apache-2.0)
 https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client
 For the full text of the Apache-2.0 license, see Apache License 2.0 (Apache-2.0)

-marklogic-spark-connector 2.3 (Apache-2.0)
+marklogic-spark-connector 2.4.0 (Apache-2.0)
 https://repo1.maven.org/maven2/com/marklogic/marklogic-spark-connector
For the full text of the Apache-2.0 license, see Apache License 2.0 (Apache-2.0)

@@ -43,11 +43,11 @@ picocli 4.7.6 (Apache-2.0)
 https://repo1.maven.org/maven2/info/picocli/picocli
 For the full text of the Apache-2.0 license, see Apache License 2.0 (Apache-2.0)

-spark-avro_2.12 3.4.3 (Apache-2.0)
+spark-avro_2.12 3.5.3 (Apache-2.0)
 https://repo1.maven.org/maven2/org/apache/spark/spark-avro_2.12
 For the full text of the Apache-2.0 license, see Apache License 2.0 (Apache-2.0)

-spark-sql_2.12 3.4.3 (Apache-2.0)
+spark-sql_2.12 3.5.3 (Apache-2.0)
 https://repo1.maven.org/maven2/org/apache/spark/spark-sql_2.12
 For the full text of the Apache-2.0 license, see Apache License 2.0 (Apache-2.0)
diff --git a/build.gradle b/build.gradle
index e2ce7c34..ec4fa3c6 100644
--- a/build.gradle
+++ b/build.gradle
@@ -10,10 +10,10 @@ subprojects {

     repositories {
         mavenCentral()
-        mavenLocal()
-        maven {
-            url "https://bed-artifactory.bedford.progress.com:443/artifactory/ml-maven-snapshots/"
-        }
+//        mavenLocal()
+//        maven {
+//            url "https://bed-artifactory.bedford.progress.com:443/artifactory/ml-maven-snapshots/"
+//        }
     }

     test {
diff --git a/docs/api.md b/docs/api.md
index b1b5189a..f953eb7b 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -22,7 +22,7 @@ To add Flux as a dependency to your application, add the following to your Maven
 <dependency>
     <groupId>com.marklogic</groupId>
     <artifactId>flux-api</artifactId>
-    <version>1.0.0</version>
+    <version>1.1.0</version>
 </dependency>
 ```

@@ -30,7 +30,7 @@ Or if you are using Gradle, add the following to your `build.gradle` file:

 ```
 dependencies {
-    implementation "com.marklogic:flux-api:1.0.0"
+    implementation "com.marklogic:flux-api:1.1.0"
 }
 ```

@@ -97,7 +97,7 @@ buildscript {
         mavenCentral()
     }
     dependencies {
-        classpath "com.marklogic:flux-api:1.0.0"
+        classpath "com.marklogic:flux-api:1.1.0"
     }
 }
 ```
@@ -139,7 +139,7 @@ buildscript {
         mavenCentral()
     }
     dependencies {
-        classpath "com.marklogic:flux-api:1.0.0"
+        classpath "com.marklogic:flux-api:1.1.0"
         classpath("com.marklogic:ml-gradle:4.8.0") {
             exclude group: "com.fasterxml.jackson.databind"
             exclude group: "com.fasterxml.jackson.core"
diff --git a/docs/getting-started.md b/docs/getting-started.md
index d96caabf..4e21e2a8 100644
--- a/docs/getting-started.md
+++ b/docs/getting-started.md
@@ -15,7 +15,7 @@ This guide describes how to get started with Flux with some examples demonstrati

 ## Setup

 You can download the latest release of the Flux application zip from [the latest Flux release page](https://github.com/marklogic/flux/releases).
-The Flux application zip is titled `marklogic-flux-1.0.0.zip`. You can extract this zip to any location on your
+The Flux application zip is titled `marklogic-flux-1.1.0.zip`. You can extract this zip to any location on your
 filesystem that you prefer.

 ### Deploying the example application
@@ -23,10 +23,10 @@ filesystem that you prefer.

 The examples in this guide, along with examples found throughout this documentation, depend on a small MarkLogic
 application that can be deployed to your own instance of MarkLogic server. The application can be downloaded from
 [the latest Flux release page](https://github.com/marklogic/flux/releases) in a zip titled
-`marklogic-flux-getting-started-1.0.0.zip`. To use Flux with this example application, perform the following steps:
+`marklogic-flux-getting-started-1.1.0.zip`. To use Flux with this example application, perform the following steps:

-1. Extract the `marklogic-flux-getting-started-1.0.0.zip` file to any location on your local filesystem.
-2. Run `cd marklogic-flux-getting-started-1.0.0` to change to the directory created by extracting the ZIP file.
+1. Extract the `marklogic-flux-getting-started-1.1.0.zip` file to any location on your local filesystem.
+2. Run `cd marklogic-flux-getting-started-1.1.0` to change to the directory created by extracting the ZIP file.
 3. Create a file named `gradle-local.properties` and add `mlPassword=your MarkLogic admin user password` to it.
 4. Examine the contents of the `gradle.properties` file to ensure that the value of `mlHost` points to your MarkLogic
 server and that the value of `mlRestPort` is a port available for a new MarkLogic app server to use.
@@ -38,15 +38,15 @@ privileges for running the examples in this guide. Finally, the application incl
 [MarkLogic TDE template](https://docs.marklogic.com/guide/app-dev/TDE) that creates a view in MarkLogic for the purpose
 of demonstrating commands that utilize a [MarkLogic Optic query](https://docs.marklogic.com/guide/app-dev/OpticAPI).

-It is recommended to extract the Flux application zip into the `marklogic-flux-getting-started-1.0.0` directory so that
+It is recommended to extract the Flux application zip into the `marklogic-flux-getting-started-1.1.0` directory so that
 you can easily execute the examples in this guide. After extracting the application zip, the directory should have a
 structure similar to this (not all files may be shown):

 ```
-./marklogic-flux-getting-started-1.0.0
+./marklogic-flux-getting-started-1.1.0
     build.gradle
     ./data
-    ./marklogic-flux-1.0.0
+    ./marklogic-flux-1.1.0
     ./gradle
     gradle.properties
     gradlew
@@ -59,7 +59,7 @@

 You can run Flux without any options to see the list of available commands. If you are using Flux to run these
 examples, first change your current directory to where you extract Flux:

-    cd marklogic-flux-1.0.0
+    cd marklogic-flux-1.1.0

 And then run the Flux executable without any options:

diff --git a/docs/spark-integration.md b/docs/spark-integration.md
index cec6584c..731c2799 100644
--- a/docs/spark-integration.md
+++ b/docs/spark-integration.md
@@ -15,7 +15,7 @@ require more system resources than what are available when running Flux as a com

 ## Spark security notice

-As of August 2024 and the Flux 1.0.0 release, all public releases of Apache Spark 3.4.x through 3.5.1 depend on
+As of October 2024 and the Flux 1.1.0 release, all public releases of Apache Spark 3.4.x through 3.5.3 depend on
 Apache Hadoop 3.3.4. This version of Hadoop has a
 [CVE filed against it](https://nvd.nist.gov/vuln/detail/CVE-2023-26031). The CVE involves Spark running with a YARN
 cluster manager and the YARN cluster "is accepting work from remote (authenticated) users".
@@ -35,8 +35,8 @@ Flux integrates with [spark-submit](https://spark.apache.org/docs/latest/submitt
 submit a Flux command invocation to a remote Spark cluster. Every Flux command is a Spark application, and thus every
 Flux command, along with all of its options, can be invoked via `spark-submit`.

-To use Flux with `spark-submit`, first download the `marklogic-flux-1.0.0-all.jar` file from the
-[GitHub release page](https://github.com/marklogic/flux/releases/tag/1.0.0). This jar file includes Flux and all of
+To use Flux with `spark-submit`, first download the `marklogic-flux-1.1.0-all.jar` file from the
+[GitHub release page](https://github.com/marklogic/flux/releases/tag/1.1.0). This jar file includes Flux and all of
 its dependencies, excluding those of Spark itself, which will be provided via the Spark cluster that you connect to
 via `spark-submit`.
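The same invocation shown in the spark-submit examples in these docs can also be driven programmatically through Spark's launcher API. A hedged sketch - the master URL, jar location, and connection string below are placeholders:

```
import org.apache.spark.launcher.SparkLauncher;

public class FluxSparkSubmitSketch {

    public static void main(String[] args) throws Exception {
        // Placeholders: adjust the master URL, jar path, and connection
        // string for your environment.
        Process process = new SparkLauncher()
            .setMaster("spark://changeme:7077")
            .setAppResource("marklogic-flux-1.1.0-all.jar")
            .setMainClass("com.marklogic.flux.spark.Submit")
            .addAppArgs("import-files",
                "--path", "path/to/data",
                "--connection-string", "user:password@host:8000")
            .launch();
        System.exit(process.waitFor());
    }
}
```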
@@ -48,7 +48,7 @@ The following shows a notional example of running the Flux `import-files` comman ``` $SPARK_HOME/bin/spark-submit --class com.marklogic.flux.spark.Submit \ --master spark://changeme:7077 \ - marklogic-flux-1.0.0-all.jar \ + marklogic-flux-1.1.0-all.jar \ import-files \ --path path/to/data \ --connection-string user:password@host:8000 \ @@ -59,7 +59,7 @@ $SPARK_HOME/bin/spark-submit --class com.marklogic.flux.spark.Submit \ ``` $SPARK_HOME\bin\spark-submit --class com.marklogic.flux.spark.Submit ^ --master spark://changeme:7077 ^ - marklogic-flux-1.0.0-all.jar ^ + marklogic-flux-1.1.0-all.jar ^ import-files ^ --path path/to/data ^ --connection-string user:password@host:8000 ^ diff --git a/examples/client-project/build.gradle b/examples/client-project/build.gradle index 75c97e5c..5a979f94 100644 --- a/examples/client-project/build.gradle +++ b/examples/client-project/build.gradle @@ -6,7 +6,7 @@ buildscript { mavenLocal() } dependencies { - classpath "com.marklogic:flux-api:1.0.0" + classpath "com.marklogic:flux-api:1.1.0" // Demonstrates removing the Jackson libraries that otherwise cause a conflict with // Spark, which requires Jackson >= 2.14.0 and < 2.15.0. @@ -28,7 +28,7 @@ repositories { } dependencies { - implementation "com.marklogic:flux-api:1.0.0" + implementation "com.marklogic:flux-api:1.1.0" } tasks.register("runApp", JavaExec) { diff --git a/flux-cli/build.gradle b/flux-cli/build.gradle index 27ae0d57..800bdd5c 100644 --- a/flux-cli/build.gradle +++ b/flux-cli/build.gradle @@ -17,7 +17,7 @@ dependencies { // The rocksdbjni dependency weighs in at 50mb and so far does not appear necessary for our use of Spark. exclude module: "rocksdbjni" } - implementation "com.marklogic:marklogic-spark-connector:2.4-SNAPSHOT" + implementation "com.marklogic:marklogic-spark-connector:2.4.0" implementation "info.picocli:picocli:4.7.6" // Spark 3.4.3 depends on Hadoop 3.3.4, which depends on AWS SDK 1.12.262. As of August 2024, all public releases of @@ -75,7 +75,7 @@ dependencies { // Forcing HttpClient to use this to address https://snyk.io/vuln/SNYK-JAVA-COMMONSCODEC-561518 . testImplementation 'commons-codec:commons-codec:1.17.1' - shadowDependencies "com.marklogic:marklogic-spark-connector:2.4-SNAPSHOT" + shadowDependencies "com.marklogic:marklogic-spark-connector:2.4.0" shadowDependencies "info.picocli:picocli:4.7.6" } diff --git a/gradle.properties b/gradle.properties index 0b4dfaf7..5dea9f33 100644 --- a/gradle.properties +++ b/gradle.properties @@ -1,4 +1,4 @@ -version=1.1.0-SNAPSHOT +version=1.1.0 # Define these on the command line to publish to OSSRH # See https://central.sonatype.org/publish/publish-gradle/#credentials for more information
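For consumers of the `flux-api:1.1.0` artifact referenced above, a minimal usage sketch. The `Flux` entry point and the fluent option names are assumptions based on the documented Gradle/Maven coordinates and the API's consumer-based style - verify against the published 1.1.0 Javadocs before relying on them:

```
import com.marklogic.flux.api.Flux;

public class FluxApiSketch {

    public static void main(String[] args) {
        // Assumed entry point and option names - confirm against the
        // flux-api 1.1.0 Javadocs.
        Flux.importGenericFiles()
            .connectionString("user:password@localhost:8000")
            .from(options -> options.paths("path/to/files"))
            .to(options -> options.collections("getting-started"))
            .execute();
    }
}
```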