diff --git a/src/borg/archiver/__init__.py b/src/borg/archiver/__init__.py
index c9e1236fa0..b3f0b308c2 100644
--- a/src/borg/archiver/__init__.py
+++ b/src/borg/archiver/__init__.py
@@ -64,6 +64,7 @@ def get_func(args):
     raise Exception("expected func attributes not found")
 
 
+from .analyze_cmd import AnalyzeMixIn
 from .benchmark_cmd import BenchmarkMixIn
 from .check_cmd import CheckMixIn
 from .compact_cmd import CompactMixIn
@@ -94,6 +95,7 @@ def get_func(args):
 
 
 class Archiver(
+    AnalyzeMixIn,
     BenchmarkMixIn,
     CheckMixIn,
     CompactMixIn,
@@ -332,6 +334,7 @@ def build_parser(self):
 
         subparsers = parser.add_subparsers(title="required arguments", metavar="")
 
+        self.build_parser_analyze(subparsers, common_parser, mid_common_parser)
         self.build_parser_benchmarks(subparsers, common_parser, mid_common_parser)
         self.build_parser_check(subparsers, common_parser, mid_common_parser)
         self.build_parser_compact(subparsers, common_parser, mid_common_parser)
diff --git a/src/borg/archiver/analyze_cmd.py b/src/borg/archiver/analyze_cmd.py
new file mode 100644
index 0000000000..2317376284
--- /dev/null
+++ b/src/borg/archiver/analyze_cmd.py
@@ -0,0 +1,116 @@
+import argparse
+from collections import defaultdict
+import os
+
+from ._common import with_repository, define_archive_filters_group
+from ..archive import Archive
+from ..constants import *  # NOQA
+from ..helpers import bin_to_hex, Error
+from ..helpers import ProgressIndicatorPercent
+from ..manifest import Manifest
+from ..remote import RemoteRepository
+from ..repository import Repository
+
+from ..logger import create_logger
+
+logger = create_logger()
+
+
+class ArchiveAnalyzer:
+    def __init__(self, args, repository, manifest):
+        self.args = args
+        self.repository = repository
+        assert isinstance(repository, (Repository, RemoteRepository))
+        self.manifest = manifest
+        self.difference_by_path = defaultdict(int)  # directory path -> count of chunks changed
+
+    def analyze(self):
+        logger.info("Starting archives analysis...")
+        self.analyze_archives()
+        self.report()
+        logger.info("Finished archives analysis.")
+
+    def analyze_archives(self) -> None:
+        """Analyze all archives matching the given selection criteria."""
+        archive_infos = self.manifest.archives.list_considering(self.args)
+        num_archives = len(archive_infos)
+        if num_archives < 2:
+            raise Error("Need at least 2 archives to analyze.")
+
+        pi = ProgressIndicatorPercent(
+            total=num_archives, msg="Analyzing archives %3.1f%%", step=0.1, msgid="analyze.analyze_archives"
+        )
+        i = 0
+        info = archive_infos[i]
+        pi.show(i)
+        logger.info(f"Analyzing archive {info.name} {info.ts} {bin_to_hex(info.id)} ({i + 1}/{num_archives})")
+        base = self.analyze_archive(info.id)
+        for i, info in enumerate(archive_infos[1:]):
+            pi.show(i + 1)
+            logger.info(f"Analyzing archive {info.name} {info.ts} {bin_to_hex(info.id)} ({i + 2}/{num_archives})")
+            new = self.analyze_archive(info.id)
+            self.analyze_change(base, new)
+            base = new
+        pi.finish()
+
+    def analyze_archive(self, id):
+        """compute the set of chunks for each directory in this archive"""
+        archive = Archive(self.manifest, id)
+        chunks_by_path = defaultdict(set)  # collect all chunk IDs generated from files in this directory path
+        for item in archive.iter_items():
+            if "chunks" in item:
+                item_chunks = set(id for id, size in item.chunks)
+                directory_path = os.path.dirname(item.path)
+                chunks_by_path[directory_path].update(item_chunks)
+        return chunks_by_path
+
+    def analyze_change(self, base, new):
+        """for each directory path, count the chunks changed (removed or added chunks) between base and new."""
+
+        def analyze_path_change(path):
+            base_chunks = base[path]
+            new_chunks = new[path]
+            different_chunks = base_chunks.symmetric_difference(new_chunks)  # removed or added chunks
+            self.difference_by_path[path] += len(different_chunks)
+
+        for directory_path in base:
+            analyze_path_change(directory_path)
+        for directory_path in new:
+            if directory_path not in base:
+                analyze_path_change(directory_path)
+
+    def report(self):
+        print()
+        print("chunks added or removed by directory path")
+        print("=========================================")
+        for directory_path in sorted(self.difference_by_path, key=lambda p: self.difference_by_path[p], reverse=True):
+            difference = self.difference_by_path[directory_path]
+            if difference > 0:
+                print(f"{directory_path}: {difference}")
+
+
+class AnalyzeMixIn:
+    @with_repository(compatibility=(Manifest.Operation.READ,))
+    def do_analyze(self, args, repository, manifest):
+        """Analyze archives"""
+        ArchiveAnalyzer(args, repository, manifest).analyze()
+
+    def build_parser_analyze(self, subparsers, common_parser, mid_common_parser):
+        from ._common import process_epilog
+
+        analyze_epilog = process_epilog(
+            """
+            Analyze archives.
+            """
+        )
+        subparser = subparsers.add_parser(
+            "analyze",
+            parents=[common_parser],
+            add_help=False,
+            description=self.do_analyze.__doc__,
+            epilog=analyze_epilog,
+            formatter_class=argparse.RawDescriptionHelpFormatter,
+            help="analyze archives",
+        )
+        subparser.set_defaults(func=self.do_analyze)
+        define_archive_filters_group(subparser)
diff --git a/src/borg/testsuite/archiver/analyze_cmd.py b/src/borg/testsuite/archiver/analyze_cmd.py
new file mode 100644
index 0000000000..a2c30fcdfa
--- /dev/null
+++ b/src/borg/testsuite/archiver/analyze_cmd.py
@@ -0,0 +1,41 @@
+import pathlib
+
+from ...constants import *  # NOQA
+from . import cmd, generate_archiver_tests, RK_ENCRYPTION
+
+pytest_generate_tests = lambda metafunc: generate_archiver_tests(metafunc, kinds="local")  # NOQA
+
+
+def test_analyze(archivers, request):
+    def create_archive():
+        cmd(archiver, "create", "archive", archiver.input_path)
+
+    def analyze_archives():
+        return cmd(archiver, "analyze", "-a", "archive")
+
+    archiver = request.getfixturevalue(archivers)
+
+    cmd(archiver, "repo-create", RK_ENCRYPTION)
+    input_path = pathlib.Path(archiver.input_path)
+
+    # 1st archive
+    (input_path / "file1").write_text("foo")
+    create_archive()
+
+    # 2nd archive
+    (input_path / "file2").write_text("bar")
+    create_archive()
+
+    assert "/input: 1" in analyze_archives()  # 2nd archive added 1 chunk for input path
+
+    # 3rd archive
+    (input_path / "file3").write_text("baz")
+    create_archive()
+
+    assert "/input: 2" in analyze_archives()  # 2nd/3rd archives added 2 chunks for input path
+
+    # 4th archive
+    (input_path / "file2").unlink()
+    create_archive()
+
+    assert "/input: 3" in analyze_archives()  # 2nd/3rd archives added 2, 4th archive removed 1
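
Note: the heart of the change is ArchiveAnalyzer.analyze_change(), which takes the per-directory chunk ID sets of two consecutive archives and counts, per directory, the chunks present in only one of them (symmetric difference). A minimal standalone sketch of that logic, with made-up directory paths and chunk IDs and without any borg objects:

    from collections import defaultdict

    # shaped like what analyze_archive() builds: directory path -> set of chunk IDs
    # (the paths and chunk IDs below are illustrative only)
    base = {"/input": {"c1", "c2"}, "/input/sub": {"c3"}}
    new = {"/input": {"c1", "c4"}, "/input/sub": {"c3"}}

    difference_by_path = defaultdict(int)
    for path in base.keys() | new.keys():
        # chunks that exist in exactly one of the two archives were removed or added
        changed = base.get(path, set()).symmetric_difference(new.get(path, set()))
        difference_by_path[path] += len(changed)

    print(dict(difference_by_path))  # /input -> 2 (c2 removed, c4 added), /input/sub -> 0

With more than two archives, analyze_archives() applies this pairwise to consecutive archives and report() prints only directories with a non-zero total, largest count first, one "directory: count" line each, as the test asserts with "/input: 3".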