diff --git a/CHANGELOG.md b/CHANGELOG.md index 93a98de2c5af8..4df028f12d2fe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -46,6 +46,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - [Remote Publication] Add remote download stats ([#15291](https://github.com/opensearch-project/OpenSearch/pull/15291))) - Add support for comma-separated list of index names to be used with Snapshot Status API ([#15409](https://github.com/opensearch-project/OpenSearch/pull/15409)) - Add prefix support to hashed prefix & infix path types on remote store ([#15557](https://github.com/opensearch-project/OpenSearch/pull/15557)) +- Optimise snapshot deletion to speed up snapshot deletion and creation ([#15568](https://github.com/opensearch-project/OpenSearch/pull/15568)) - [Remote Publication] Added checksum validation for cluster state behind a cluster setting ([#15218](https://github.com/opensearch-project/OpenSearch/pull/15218)) ### Dependencies diff --git a/server/src/main/java/org/opensearch/repositories/blobstore/BlobStoreRepository.java b/server/src/main/java/org/opensearch/repositories/blobstore/BlobStoreRepository.java index d193ab3c14154..aff48f717eb44 100644 --- a/server/src/main/java/org/opensearch/repositories/blobstore/BlobStoreRepository.java +++ b/server/src/main/java/org/opensearch/repositories/blobstore/BlobStoreRepository.java @@ -66,6 +66,7 @@ import org.opensearch.common.Nullable; import org.opensearch.common.Numbers; import org.opensearch.common.Priority; +import org.opensearch.common.Randomness; import org.opensearch.common.SetOnce; import org.opensearch.common.UUIDs; import org.opensearch.common.blobstore.BlobContainer; @@ -831,7 +832,7 @@ boolean getPrefixModeVerification() { * maintains single lazy instance of {@link BlobContainer} */ protected BlobContainer blobContainer() { - // assertSnapshotOrGenericThread(); + assertSnapshotOrGenericThread(); BlobContainer blobContainer = this.blobContainer.get(); if (blobContainer == null) { @@ -1204,6 +1205,10 @@ private void asyncCleanupUnlinkedShardLevelBlobs( ActionListener listener ) { final List> filesToDelete = resolveFilesToDelete(oldRepositoryData, snapshotIds, deleteResults); + long startTimeNs = System.nanoTime(); + Randomness.shuffle(filesToDelete); + logger.debug("[{}] shuffled the filesToDelete with timeElapsedNs={}", metadata.name(), (System.nanoTime() - startTimeNs)); + if (filesToDelete.isEmpty()) { listener.onResponse(null); return; @@ -1221,8 +1226,8 @@ private void asyncCleanupUnlinkedShardLevelBlobs( staleFilesToDeleteInBatch.size() ); - // Start as many workers as fit into the snapshot pool at once at the most - final int workers = Math.min(threadPool.info(ThreadPool.Names.SNAPSHOT).getMax(), staleFilesToDeleteInBatch.size()); + // Start as many workers as fit into the snapshot_deletion pool at once at the most + final int workers = Math.min(threadPool.info(ThreadPool.Names.SNAPSHOT_DELETION).getMax(), staleFilesToDeleteInBatch.size()); for (int i = 0; i < workers; ++i) { executeStaleShardDelete(staleFilesToDeleteInBatch, remoteStoreLockManagerFactory, groupedListener); } @@ -1326,7 +1331,7 @@ private void executeStaleShardDelete( if (filesToDelete == null) { return; } - threadPool.executor(ThreadPool.Names.SNAPSHOT).execute(ActionRunnable.wrap(listener, l -> { + threadPool.executor(ThreadPool.Names.SNAPSHOT_DELETION).execute(ActionRunnable.wrap(listener, l -> { try { // filtering files for which remote store lock release and cleanup succeeded, // remaining files for which it failed will be retried in next snapshot delete run. @@ -1390,7 +1395,7 @@ private void writeUpdatedShardMetaDataAndComputeDeletes( ActionListener> onAllShardsCompleted ) { - final Executor executor = threadPool.executor(ThreadPool.Names.SNAPSHOT); + final Executor executor = threadPool.executor(ThreadPool.Names.SNAPSHOT_DELETION); final List indices = oldRepositoryData.indicesToUpdateAfterRemovingSnapshot(snapshotIds); if (indices.isEmpty()) { @@ -1578,7 +1583,7 @@ private void cleanupStaleBlobs( listener.onResponse(deleteResult); }, listener::onFailure), 2); - final Executor executor = threadPool.executor(ThreadPool.Names.SNAPSHOT); + final Executor executor = threadPool.executor(ThreadPool.Names.SNAPSHOT_DELETION); final List staleRootBlobs = staleRootBlobs(newRepoData, rootBlobs.keySet()); if (staleRootBlobs.isEmpty()) { groupedListener.onResponse(DeleteResult.ZERO); @@ -1781,7 +1786,7 @@ void cleanupStaleIndices( // Start as many workers as fit into the snapshot pool at once at the most final int workers = Math.min( - threadPool.info(ThreadPool.Names.SNAPSHOT).getMax(), + threadPool.info(ThreadPool.Names.SNAPSHOT_DELETION).getMax(), foundIndices.size() - survivingIndexIds.size() ); for (int i = 0; i < workers; ++i) { @@ -1833,7 +1838,7 @@ private void executeOneStaleIndexDelete( return; } final String indexSnId = indexEntry.getKey(); - threadPool.executor(ThreadPool.Names.SNAPSHOT).execute(ActionRunnable.supply(listener, () -> { + threadPool.executor(ThreadPool.Names.SNAPSHOT_DELETION).execute(ActionRunnable.supply(listener, () -> { try { logger.debug("[{}] Found stale index [{}]. Cleaning it up", metadata.name(), indexSnId); List matchingShardPaths = findMatchingShardPaths(indexSnId, snapshotShardPaths); @@ -2097,8 +2102,7 @@ public void finalizeSnapshot( stateTransformer, repositoryUpdatePriority, ActionListener.wrap(newRepoData -> { - cleanupOldShardGens(existingRepositoryData, updatedRepositoryData); - listener.onResponse(newRepoData); + cleanupOldShardGens(existingRepositoryData, updatedRepositoryData, newRepoData, listener); }, onUpdateFailure) ); }, onUpdateFailure), 2 + indices.size()); @@ -2254,7 +2258,12 @@ private void logShardPathsOperationWarning(IndexId indexId, SnapshotId snapshotI } // Delete all old shard gen blobs that aren't referenced any longer as a result from moving to updated repository data - private void cleanupOldShardGens(RepositoryData existingRepositoryData, RepositoryData updatedRepositoryData) { + private void cleanupOldShardGens( + RepositoryData existingRepositoryData, + RepositoryData updatedRepositoryData, + RepositoryData newRepositoryData, + ActionListener listener + ) { final List toDelete = new ArrayList<>(); updatedRepositoryData.shardGenerations() .obsoleteShardGenerations(existingRepositoryData.shardGenerations()) @@ -2263,10 +2272,62 @@ private void cleanupOldShardGens(RepositoryData existingRepositoryData, Reposito (shardId, oldGen) -> toDelete.add(shardPath(indexId, shardId).buildAsString() + INDEX_FILE_PREFIX + oldGen) ) ); + if (toDelete.isEmpty()) { + listener.onResponse(newRepositoryData); + return; + } try { - deleteFromContainer(rootBlobContainer(), toDelete); + AtomicInteger counter = new AtomicInteger(); + Collection> subList = toDelete.stream() + .collect(Collectors.groupingBy(it -> counter.getAndIncrement() / maxShardBlobDeleteBatch)) + .values(); + final BlockingQueue> staleFilesToDeleteInBatch = new LinkedBlockingQueue<>(subList); + logger.info( + "[{}] cleanupOldShardGens toDeleteSize={} groupSize={}", + metadata.name(), + toDelete.size(), + staleFilesToDeleteInBatch.size() + ); + final GroupedActionListener groupedListener = new GroupedActionListener<>(ActionListener.wrap(r -> { + logger.info("[{}] completed cleanupOldShardGens", metadata.name()); + listener.onResponse(newRepositoryData); + }, ex -> { + logger.error(new ParameterizedMessage("[{}] exception in cleanupOldShardGens", metadata.name()), ex); + listener.onResponse(newRepositoryData); + }), staleFilesToDeleteInBatch.size()); + + // Start as many workers as fit into the snapshot pool at once at the most + final int workers = Math.min(threadPool.info(ThreadPool.Names.SNAPSHOT_DELETION).getMax(), staleFilesToDeleteInBatch.size()); + for (int i = 0; i < workers; ++i) { + executeOldShardGensCleanup(staleFilesToDeleteInBatch, groupedListener); + } } catch (Exception e) { - logger.warn("Failed to clean up old shard generation blobs", e); + logger.warn(new ParameterizedMessage(" [{}] Failed to clean up old shard generation blobs", metadata.name()), e); + listener.onResponse(newRepositoryData); + } + } + + private void executeOldShardGensCleanup(BlockingQueue> staleFilesToDeleteInBatch, GroupedActionListener listener) + throws InterruptedException { + List filesToDelete = staleFilesToDeleteInBatch.poll(0L, TimeUnit.MILLISECONDS); + if (filesToDelete != null) { + threadPool.executor(ThreadPool.Names.SNAPSHOT_DELETION).execute(ActionRunnable.wrap(listener, l -> { + try { + deleteFromContainer(rootBlobContainer(), filesToDelete); + l.onResponse(null); + } catch (Exception e) { + logger.warn( + () -> new ParameterizedMessage( + "[{}] Failed to delete following blobs during cleanupOldFiles : {}", + metadata.name(), + filesToDelete + ), + e + ); + l.onFailure(e); + } + executeOldShardGensCleanup(staleFilesToDeleteInBatch, listener); + })); } } @@ -2383,10 +2444,11 @@ public long getRemoteDownloadThrottleTimeInNanos() { } protected void assertSnapshotOrGenericThread() { - assert Thread.currentThread().getName().contains('[' + ThreadPool.Names.SNAPSHOT + ']') + assert Thread.currentThread().getName().contains('[' + ThreadPool.Names.SNAPSHOT_DELETION + ']') + || Thread.currentThread().getName().contains('[' + ThreadPool.Names.SNAPSHOT + ']') || Thread.currentThread().getName().contains('[' + ThreadPool.Names.GENERIC + ']') : "Expected current thread [" + Thread.currentThread() - + "] to be the snapshot or generic thread."; + + "] to be the snapshot_deletion or snapshot or generic thread."; } @Override diff --git a/server/src/main/java/org/opensearch/threadpool/ThreadPool.java b/server/src/main/java/org/opensearch/threadpool/ThreadPool.java index 056ef0fac0153..81220ab171b34 100644 --- a/server/src/main/java/org/opensearch/threadpool/ThreadPool.java +++ b/server/src/main/java/org/opensearch/threadpool/ThreadPool.java @@ -105,6 +105,7 @@ public static class Names { public static final String REFRESH = "refresh"; public static final String WARMER = "warmer"; public static final String SNAPSHOT = "snapshot"; + public static final String SNAPSHOT_DELETION = "snapshot_deletion"; public static final String FORCE_MERGE = "force_merge"; public static final String FETCH_SHARD_STARTED = "fetch_shard_started"; public static final String FETCH_SHARD_STORE = "fetch_shard_store"; @@ -176,6 +177,7 @@ public static ThreadPoolType fromType(String type) { map.put(Names.REFRESH, ThreadPoolType.SCALING); map.put(Names.WARMER, ThreadPoolType.SCALING); map.put(Names.SNAPSHOT, ThreadPoolType.SCALING); + map.put(Names.SNAPSHOT_DELETION, ThreadPoolType.SCALING); map.put(Names.FORCE_MERGE, ThreadPoolType.FIXED); map.put(Names.FETCH_SHARD_STARTED, ThreadPoolType.SCALING); map.put(Names.FETCH_SHARD_STORE, ThreadPoolType.SCALING); @@ -234,6 +236,7 @@ public ThreadPool( final int halfProcMaxAt5 = halfAllocatedProcessorsMaxFive(allocatedProcessors); final int halfProcMaxAt10 = halfAllocatedProcessorsMaxTen(allocatedProcessors); final int genericThreadPoolMax = boundedBy(4 * allocatedProcessors, 128, 512); + final int snapshotDeletionPoolMax = boundedBy(4 * allocatedProcessors, 64, 256); builders.put(Names.GENERIC, new ScalingExecutorBuilder(Names.GENERIC, 4, genericThreadPoolMax, TimeValue.timeValueSeconds(30))); builders.put(Names.WRITE, new FixedExecutorBuilder(settings, Names.WRITE, allocatedProcessors, 10000)); builders.put(Names.GET, new FixedExecutorBuilder(settings, Names.GET, allocatedProcessors, 1000)); @@ -251,6 +254,10 @@ public ThreadPool( builders.put(Names.REFRESH, new ScalingExecutorBuilder(Names.REFRESH, 1, halfProcMaxAt10, TimeValue.timeValueMinutes(5))); builders.put(Names.WARMER, new ScalingExecutorBuilder(Names.WARMER, 1, halfProcMaxAt5, TimeValue.timeValueMinutes(5))); builders.put(Names.SNAPSHOT, new ScalingExecutorBuilder(Names.SNAPSHOT, 1, halfProcMaxAt5, TimeValue.timeValueMinutes(5))); + builders.put( + Names.SNAPSHOT_DELETION, + new ScalingExecutorBuilder(Names.SNAPSHOT_DELETION, 1, snapshotDeletionPoolMax, TimeValue.timeValueMinutes(5)) + ); builders.put( Names.FETCH_SHARD_STARTED, new ScalingExecutorBuilder(Names.FETCH_SHARD_STARTED, 1, 2 * allocatedProcessors, TimeValue.timeValueMinutes(5)) diff --git a/server/src/test/java/org/opensearch/threadpool/ScalingThreadPoolTests.java b/server/src/test/java/org/opensearch/threadpool/ScalingThreadPoolTests.java index d8f04a11fe494..4f59f9688fb7e 100644 --- a/server/src/test/java/org/opensearch/threadpool/ScalingThreadPoolTests.java +++ b/server/src/test/java/org/opensearch/threadpool/ScalingThreadPoolTests.java @@ -148,6 +148,7 @@ private int expectedSize(final String threadPoolName, final int numberOfProcesso sizes.put(ThreadPool.Names.REFRESH, ThreadPool::halfAllocatedProcessorsMaxTen); sizes.put(ThreadPool.Names.WARMER, ThreadPool::halfAllocatedProcessorsMaxFive); sizes.put(ThreadPool.Names.SNAPSHOT, ThreadPool::halfAllocatedProcessorsMaxFive); + sizes.put(ThreadPool.Names.SNAPSHOT_DELETION, n -> ThreadPool.boundedBy(4 * n, 64, 256)); sizes.put(ThreadPool.Names.FETCH_SHARD_STARTED, ThreadPool::twiceAllocatedProcessors); sizes.put(ThreadPool.Names.FETCH_SHARD_STORE, ThreadPool::twiceAllocatedProcessors); sizes.put(ThreadPool.Names.TRANSLOG_TRANSFER, ThreadPool::halfAllocatedProcessors);