Redefine Drop Index as logical delete (#2386) (#2397)

* Redefine Drop Index as logical delete * merge from 2.x * add refresh_job limit and disable batch query * update doc --------- (cherry picked from commit cb8d953) Signed-off-by: Peng Huo <[email protected]> Signed-off-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com> Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
opensearch-project · Oct 30, 2023 · e939bb6 · e939bb6
1 parent deb3ccf
commit e939bb6
Show file tree

Hide file tree

Showing 45 changed files with 2,269 additions and 769 deletions.
diff --git a/common/src/main/java/org/opensearch/sql/common/setting/Settings.java b/common/src/main/java/org/opensearch/sql/common/setting/Settings.java
@@ -5,8 +5,6 @@
 
 package org.opensearch.sql.common.setting;
 
-import static org.opensearch.sql.common.setting.Settings.Key.SPARK_EXECUTION_SESSION_ENABLED;
-
 import com.google.common.base.Strings;
 import com.google.common.collect.ImmutableMap;
 import java.util.List;
@@ -40,8 +38,8 @@ public enum Key {
     METRICS_ROLLING_INTERVAL("plugins.query.metrics.rolling_interval"),
     SPARK_EXECUTION_ENGINE_CONFIG("plugins.query.executionengine.spark.config"),
     CLUSTER_NAME("cluster.name"),
-    SPARK_EXECUTION_SESSION_ENABLED("plugins.query.executionengine.spark.session.enabled"),
     SPARK_EXECUTION_SESSION_LIMIT("plugins.query.executionengine.spark.session.limit"),
+    SPARK_EXECUTION_REFRESH_JOB_LIMIT("plugins.query.executionengine.spark.refresh_job.limit"),
     SESSION_INDEX_TTL("plugins.query.executionengine.spark.session.index.ttl"),
     RESULT_INDEX_TTL("plugins.query.executionengine.spark.result.index.ttl"),
     AUTO_INDEX_MANAGEMENT_ENABLED(
@@ -69,9 +67,4 @@ public static Optional<Key> of(String keyValue) {
   public abstract <T> T getSettingValue(Key key);
 
   public abstract List<?> getSettings();
-
-  /** Helper class */
-  public static boolean isSparkExecutionSessionEnabled(Settings settings) {
-    return settings.getSettingValue(SPARK_EXECUTION_SESSION_ENABLED);
-  }
 }
diff --git a/docs/user/admin/settings.rst b/docs/user/admin/settings.rst
@@ -311,15 +311,16 @@ SQL query::
       "status": 400
     }
 
-plugins.query.executionengine.spark.session.enabled
-===================================================
+
+plugins.query.executionengine.spark.session.limit
+==================================================
 
 Description
 -----------
 
-By default, execution engine is executed in session mode. You can disable session mode by this setting.
+Each cluster can have a maximum of 100 sessions running in parallel by default. You can increase the limit with this setting.
 
-1. The default value is true.
+1. The default value is 100.
 2. This setting is node scope.
 3. This setting can be updated dynamically.
 
@@ -328,7 +329,7 @@ You can update the setting with a new value like this.
 SQL query::
 
     sh$ curl -sS -H 'Content-Type: application/json' -X PUT localhost:9200/_plugins/_query/settings \
-    ... -d '{"transient":{"plugins.query.executionengine.spark.session.enabled":"false"}}'
+    ... -d '{"transient":{"plugins.query.executionengine.spark.session.limit":200}}'
     {
       "acknowledged": true,
       "persistent": {},
@@ -338,7 +339,7 @@ SQL query::
             "executionengine": {
               "spark": {
                 "session": {
-                  "enabled": "false"
+                  "limit": "200"
                 }
               }
             }
@@ -347,15 +348,16 @@ SQL query::
       }
     }
 
-plugins.query.executionengine.spark.session.limit
-==================================================
+
+plugins.query.executionengine.spark.refresh_job.limit
+=====================================================
 
 Description
 -----------
 
-Each cluster can have maximum 100 sessions running in parallel by default. You can increase limit by this setting.
+Each cluster can have a maximum of 50 refresh jobs by default. You can increase the limit with this setting.
 
-1. The default value is 100.
+1. The default value is 50.
 2. This setting is node scope.
 3. This setting can be updated dynamically.
 
@@ -364,7 +366,7 @@ You can update the setting with a new value like this.
 SQL query::
 
     sh$ curl -sS -H 'Content-Type: application/json' -X PUT localhost:9200/_plugins/_query/settings \
-    ... -d '{"transient":{"plugins.query.executionengine.spark.session.limit":200}}'
+    ... -d '{"transient":{"plugins.query.executionengine.spark.refresh_job.limit":200}}'
     {
       "acknowledged": true,
       "persistent": {},
@@ -373,7 +375,7 @@ SQL query::
           "query": {
             "executionengine": {
               "spark": {
-                "session": {
+                "refresh_job": {
                   "limit": "200"
                 }
               }

diff --git a/opensearch/src/main/java/org/opensearch/sql/opensearch/setting/OpenSearchSettings.java b/opensearch/src/main/java/org/opensearch/sql/opensearch/setting/OpenSearchSettings.java
@@ -137,20 +137,20 @@ public class OpenSearchSettings extends Settings {
           Setting.Property.NodeScope,
           Setting.Property.Dynamic);
 
-  public static final Setting<?> SPARK_EXECUTION_SESSION_ENABLED_SETTING =
-      Setting.boolSetting(
-          Key.SPARK_EXECUTION_SESSION_ENABLED.getKeyValue(),
-          true,
-          Setting.Property.NodeScope,
-          Setting.Property.Dynamic);
-
   public static final Setting<?> SPARK_EXECUTION_SESSION_LIMIT_SETTING =
       Setting.intSetting(
           Key.SPARK_EXECUTION_SESSION_LIMIT.getKeyValue(),
           100,
           Setting.Property.NodeScope,
           Setting.Property.Dynamic);
 
+  public static final Setting<?> SPARK_EXECUTION_REFRESH_JOB_LIMIT_SETTING =
+      Setting.intSetting(
+          Key.SPARK_EXECUTION_REFRESH_JOB_LIMIT.getKeyValue(),
+          50,
+          Setting.Property.NodeScope,
+          Setting.Property.Dynamic);
+
   public static final Setting<TimeValue> SESSION_INDEX_TTL_SETTING =
       Setting.positiveTimeSetting(
           Key.SESSION_INDEX_TTL.getKeyValue(),
@@ -249,18 +249,18 @@ public OpenSearchSettings(ClusterSettings clusterSettings) {
         Key.SPARK_EXECUTION_ENGINE_CONFIG,
         SPARK_EXECUTION_ENGINE_CONFIG,
         new Updater(Key.SPARK_EXECUTION_ENGINE_CONFIG));
-    register(
-        settingBuilder,
-        clusterSettings,
-        Key.SPARK_EXECUTION_SESSION_ENABLED,
-        SPARK_EXECUTION_SESSION_ENABLED_SETTING,
-        new Updater(Key.SPARK_EXECUTION_SESSION_ENABLED));
     register(
         settingBuilder,
         clusterSettings,
         Key.SPARK_EXECUTION_SESSION_LIMIT,
         SPARK_EXECUTION_SESSION_LIMIT_SETTING,
         new Updater(Key.SPARK_EXECUTION_SESSION_LIMIT));
+    register(
+        settingBuilder,
+        clusterSettings,
+        Key.SPARK_EXECUTION_REFRESH_JOB_LIMIT,
+        SPARK_EXECUTION_REFRESH_JOB_LIMIT_SETTING,
+        new Updater(Key.SPARK_EXECUTION_REFRESH_JOB_LIMIT));
     register(
         settingBuilder,
         clusterSettings,
@@ -350,8 +350,8 @@ public static List<Setting<?>> pluginSettings() {
         .add(METRICS_ROLLING_INTERVAL_SETTING)
         .add(DATASOURCE_URI_HOSTS_DENY_LIST)
         .add(SPARK_EXECUTION_ENGINE_CONFIG)
-        .add(SPARK_EXECUTION_SESSION_ENABLED_SETTING)
         .add(SPARK_EXECUTION_SESSION_LIMIT_SETTING)
+        .add(SPARK_EXECUTION_REFRESH_JOB_LIMIT_SETTING)
         .add(SESSION_INDEX_TTL_SETTING)
         .add(RESULT_INDEX_TTL_SETTING)
         .add(AUTO_INDEX_MANAGEMENT_ENABLED_SETTING)

diff --git a/plugin/src/main/java/org/opensearch/sql/plugin/SQLPlugin.java b/plugin/src/main/java/org/opensearch/sql/plugin/SQLPlugin.java
@@ -335,7 +335,8 @@ private AsyncQueryExecutorService createAsyncQueryExecutorService(
             new FlintIndexMetadataReaderImpl(client),
             client,
             new SessionManager(stateStore, emrServerlessClient, pluginSettings),
-            new DefaultLeaseManager(pluginSettings, stateStore));
+            new DefaultLeaseManager(pluginSettings, stateStore),
+            stateStore);
     return new AsyncQueryExecutorServiceImpl(
         asyncQueryJobMetadataStorageService,
         sparkQueryDispatcher,

diff --git a/spark/build.gradle b/spark/build.gradle
@@ -123,6 +123,7 @@ jacocoTestCoverageVerification {
                     'org.opensearch.sql.spark.execution.statestore.StateStore',
                     'org.opensearch.sql.spark.execution.session.SessionModel',
                     'org.opensearch.sql.spark.execution.statement.StatementModel',
+                    'org.opensearch.sql.spark.flint.FlintIndexStateModel',
                     // TODO: add tests for purging flint indices
                     'org.opensearch.sql.spark.cluster.ClusterManagerEventListener*',
                     'org.opensearch.sql.spark.cluster.FlintIndexRetention',

diff --git a/spark/src/main/java/org/opensearch/sql/spark/asyncquery/AsyncQueryExecutorServiceImpl.java b/spark/src/main/java/org/opensearch/sql/spark/asyncquery/AsyncQueryExecutorServiceImpl.java
@@ -72,7 +72,6 @@ public CreateAsyncQueryResponse createAsyncQuery(
             dispatchQueryResponse.getQueryId(),
             sparkExecutionEngineConfig.getApplicationId(),
             dispatchQueryResponse.getJobId(),
-            dispatchQueryResponse.isDropIndexQuery(),
             dispatchQueryResponse.getResultIndex(),
             dispatchQueryResponse.getSessionId()));
     return new CreateAsyncQueryResponse(

diff --git a/spark/src/main/java/org/opensearch/sql/spark/asyncquery/model/AsyncQueryJobMetadata.java b/spark/src/main/java/org/opensearch/sql/spark/asyncquery/model/AsyncQueryJobMetadata.java
@@ -29,7 +29,6 @@ public class AsyncQueryJobMetadata extends StateModel {
   private final AsyncQueryId queryId;
   private final String applicationId;
   private final String jobId;
-  private final boolean isDropIndexQuery;
   private final String resultIndex;
   // optional sessionId.
   private final String sessionId;
@@ -43,7 +42,6 @@ public AsyncQueryJobMetadata(
         queryId,
         applicationId,
         jobId,
-        false,
         resultIndex,
         null,
         SequenceNumbers.UNASSIGNED_SEQ_NO,
@@ -54,14 +52,12 @@ public AsyncQueryJobMetadata(
       AsyncQueryId queryId,
       String applicationId,
       String jobId,
-      boolean isDropIndexQuery,
       String resultIndex,
       String sessionId) {
     this(
         queryId,
         applicationId,
         jobId,
-        isDropIndexQuery,
         resultIndex,
         sessionId,
         SequenceNumbers.UNASSIGNED_SEQ_NO,
@@ -72,15 +68,13 @@ public AsyncQueryJobMetadata(
       AsyncQueryId queryId,
       String applicationId,
       String jobId,
-      boolean isDropIndexQuery,
       String resultIndex,
       String sessionId,
       long seqNo,
       long primaryTerm) {
     this.queryId = queryId;
     this.applicationId = applicationId;
     this.jobId = jobId;
-    this.isDropIndexQuery = isDropIndexQuery;
     this.resultIndex = resultIndex;
     this.sessionId = sessionId;
     this.seqNo = seqNo;
@@ -106,7 +100,6 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
         .field("type", TYPE_JOBMETA)
         .field("jobId", jobId)
         .field("applicationId", applicationId)
-        .field("isDropIndexQuery", isDropIndexQuery)
         .field("resultIndex", resultIndex)
         .field("sessionId", sessionId)
         .endObject();
@@ -120,7 +113,6 @@ public static AsyncQueryJobMetadata copy(
         copy.getQueryId(),
         copy.getApplicationId(),
         copy.getJobId(),
-        copy.isDropIndexQuery(),
         copy.getResultIndex(),
         copy.getSessionId(),
         seqNo,
@@ -176,14 +168,7 @@ public static AsyncQueryJobMetadata fromXContent(
       throw new IllegalArgumentException("jobId and applicationId are required fields.");
     }
     return new AsyncQueryJobMetadata(
-        queryId,
-        applicationId,
-        jobId,
-        isDropIndexQuery,
-        resultIndex,
-        sessionId,
-        seqNo,
-        primaryTerm);
+        queryId, applicationId, jobId, resultIndex, sessionId, seqNo, primaryTerm);
   }
 
   @Override

diff --git a/spark/src/main/java/org/opensearch/sql/spark/dispatcher/AsyncQueryHandler.java b/spark/src/main/java/org/opensearch/sql/spark/dispatcher/AsyncQueryHandler.java
@@ -20,11 +20,6 @@
 public abstract class AsyncQueryHandler {
 
   public JSONObject getQueryResponse(AsyncQueryJobMetadata asyncQueryJobMetadata) {
-    if (asyncQueryJobMetadata.isDropIndexQuery()) {
-      return SparkQueryDispatcher.DropIndexResult.fromJobId(asyncQueryJobMetadata.getJobId())
-          .result();
-    }
-
     JSONObject result = getResponseFromResultIndex(asyncQueryJobMetadata);
     if (result.has(DATA_FIELD)) {
       JSONObject items = result.getJSONObject(DATA_FIELD);

diff --git a/spark/src/main/java/org/opensearch/sql/spark/dispatcher/BatchQueryHandler.java b/spark/src/main/java/org/opensearch/sql/spark/dispatcher/BatchQueryHandler.java
@@ -22,12 +22,15 @@
 import org.opensearch.sql.spark.dispatcher.model.DispatchQueryRequest;
 import org.opensearch.sql.spark.dispatcher.model.DispatchQueryResponse;
 import org.opensearch.sql.spark.dispatcher.model.JobType;
+import org.opensearch.sql.spark.leasemanager.LeaseManager;
+import org.opensearch.sql.spark.leasemanager.model.LeaseRequest;
 import org.opensearch.sql.spark.response.JobExecutionResponseReader;
 
 @RequiredArgsConstructor
 public class BatchQueryHandler extends AsyncQueryHandler {
   private final EMRServerlessClient emrServerlessClient;
   private final JobExecutionResponseReader jobExecutionResponseReader;
+  protected final LeaseManager leaseManager;
 
   @Override
   protected JSONObject getResponseFromResultIndex(AsyncQueryJobMetadata asyncQueryJobMetadata) {
@@ -60,6 +63,8 @@ public String cancelJob(AsyncQueryJobMetadata asyncQueryJobMetadata) {
   @Override
   public DispatchQueryResponse submit(
       DispatchQueryRequest dispatchQueryRequest, DispatchQueryContext context) {
+    leaseManager.borrow(new LeaseRequest(JobType.BATCH, dispatchQueryRequest.getDatasource()));
+
     String jobName = dispatchQueryRequest.getClusterName() + ":" + "non-index-query";
     Map<String, String> tags = context.getTags();
     DataSourceMetadata dataSourceMetadata = context.getDataSourceMetadata();
@@ -81,6 +86,6 @@ public DispatchQueryResponse submit(
             dataSourceMetadata.getResultIndex());
     String jobId = emrServerlessClient.startJobRun(startJobRequest);
     return new DispatchQueryResponse(
-        context.getQueryId(), jobId, false, dataSourceMetadata.getResultIndex(), null);
+        context.getQueryId(), jobId, dataSourceMetadata.getResultIndex(), null);
   }
 }