Make lint-file work

Signed-off-by: Carmen Bianca BAKKER <[email protected]>
fsfe · Sep 6, 2024 · 46e42a1 · 46e42a1
1 parent f7f6586
commit 46e42a1
Show file tree

Hide file tree

Showing 10 changed files with 483 additions and 176 deletions.
diff --git a/.pylintrc b/.pylintrc
@@ -11,7 +11,8 @@ jobs=0
 
 disable=duplicate-code,
          logging-fstring-interpolation,
-         implicit-str-concat
+         implicit-str-concat,
+         inconsistent-quotes
 enable=useless-suppression
 
 [REPORTS]

diff --git a/src/reuse/lint_file.py → src/reuse/_lint_file.py b/src/reuse/lint_file.py → src/reuse/_lint_file.py
@@ -9,11 +9,13 @@
 import sys
 from argparse import ArgumentParser, Namespace
 from gettext import gettext as _
+from pathlib import Path
 from typing import IO
 
-from .lint import format_json, format_lines, format_plain
+from ._util import PathType
+from .lint import format_lines_subset
 from .project import Project
-from .report import ProjectReport
+from .report import ProjectSubsetReport
 
 
 def add_arguments(parser: ArgumentParser) -> None:
@@ -22,40 +24,34 @@ def add_arguments(parser: ArgumentParser) -> None:
     mutex_group.add_argument(
         "-q", "--quiet", action="store_true", help=_("prevents output")
     )
-    mutex_group.add_argument(
-        "-j", "--json", action="store_true", help=_("formats output as JSON")
-    )
-    mutex_group.add_argument(
-        "-p",
-        "--plain",
-        action="store_true",
-        help=_("formats output as plain text"),
-    )
     mutex_group.add_argument(
         "-l",
         "--lines",
         action="store_true",
-        help=_("formats output as errors per line"),
+        help=_("formats output as errors per line (default)"),
     )
-    parser.add_argument("files", nargs="*")
+    parser.add_argument("files", action="store", nargs="*", type=PathType("r"))
 
 
 def run(args: Namespace, project: Project, out: IO[str] = sys.stdout) -> int:
     """List all non-compliant files from specified file list."""
-    report = ProjectReport.generate(
+    subset_files = {Path(file_) for file_ in args.files}
+    for file_ in subset_files:
+        if not file_.resolve().is_relative_to(project.root.resolve()):
+            args.parser.error(
+                _("'{file}' is not inside of '{root}'").format(
+                    file=file_, root=project.root
+                )
+            )
+    report = ProjectSubsetReport.generate(
         project,
-        do_checksum=False,
-        file_list=args.files,
+        subset_files,
         multiprocessing=not args.no_multiprocessing,
     )
 
     if args.quiet:
         pass
-    elif args.json:
-        out.write(format_json(report))
-    elif args.lines:
-        out.write(format_lines(report))
     else:
-        out.write(format_plain(report))
+        out.write(format_lines_subset(report))
 
     return 0 if report.is_compliant else 1
diff --git a/src/reuse/_main.py b/src/reuse/_main.py
@@ -21,10 +21,10 @@
     __REUSE_version__,
     __version__,
     _annotate,
+    _lint_file,
     convert_dep5,
     download,
     lint,
-    lint_file,
     spdx,
     supported_licenses,
 )
@@ -178,8 +178,8 @@ def parser() -> argparse.ArgumentParser:
     add_command(
         subparsers,
         "lint-file",
-        lint_file.add_arguments,
-        lint_file.run,
+        _lint_file.add_arguments,
+        _lint_file.run,
         help=_("list non-compliant files from specified list of files"),
     )
 

diff --git a/src/reuse/lint.py b/src/reuse/lint.py
@@ -20,7 +20,7 @@
 
 from . import __REUSE_version__
 from .project import Project
-from .report import ProjectReport
+from .report import ProjectReport, ProjectReportSubsetProtocol
 
 
 def add_arguments(parser: ArgumentParser) -> None:
@@ -36,7 +36,7 @@ def add_arguments(parser: ArgumentParser) -> None:
         "-p",
         "--plain",
         action="store_true",
-        help=_("formats output as plain text"),
+        help=_("formats output as plain text (default)"),
     )
     mutex_group.add_argument(
         "-l",
@@ -264,13 +264,43 @@ def custom_serializer(obj: Any) -> Any:
     )
 
 
+def format_lines_subset(report: ProjectReportSubsetProtocol) -> str:
+    """Formats a subset of a report, namely missing licenses, read errors, files
+    without licenses, and files without copyright.
+
+    Args:
+        report: A populated report.
+    """
+    output = StringIO()
+
+    # Missing licenses
+    for lic, files in sorted(report.missing_licenses.items()):
+        for path in sorted(files):
+            output.write(
+                _("{path}: missing license {lic}\n").format(path=path, lic=lic)
+            )
+
+    # Read errors
+    for path in sorted(report.read_errors):
+        output.write(_("{path}: read error\n").format(path=path))
+
+    # Without licenses
+    for path in report.files_without_licenses:
+        output.write(_("{path}: no license identifier\n").format(path=path))
+
+    # Without copyright
+    for path in report.files_without_copyright:
+        output.write(_("{path}: no copyright notice\n").format(path=path))
+
+    return output.getvalue()
+
+
 def format_lines(report: ProjectReport) -> str:
-    """Formats data dictionary as plaintext strings to be printed to sys.stdout
-    Sorting of output is not guaranteed.
-    Symbolic links can result in multiple entries per file.
+    """Formats report as plaintext strings to be printed to sys.stdout. Sorting
+    of output is not guaranteed.
 
     Args:
-        report: ProjectReport data
+        report: A populated report.
 
     Returns:
         String (in plaintext) that can be output to sys.stdout
@@ -281,6 +311,7 @@ def license_path(lic: str) -> Optional[Path]:
         """Resolve a license identifier to a license path."""
         return report.licenses.get(lic)
 
+    subset_output = ""
     if not report.is_compliant:
         # Bad licenses
         for lic, files in sorted(report.bad_licenses.items()):
@@ -312,28 +343,10 @@ def license_path(lic: str) -> Optional[Path]:
                 _("{lic_path}: unused license\n").format(lic_path=lic_path)
             )
 
-        # Missing licenses
-        for lic, files in sorted(report.missing_licenses.items()):
-            for path in sorted(files):
-                output.write(
-                    _("{path}: missing license {lic}\n").format(
-                        path=path, lic=lic
-                    )
-                )
-
-        # Read errors
-        for path in sorted(report.read_errors):
-            output.write(_("{path}: read error\n").format(path=path))
-
-        # Without licenses
-        for path in report.files_without_licenses:
-            output.write(_("{path}: no license identifier\n").format(path=path))
-
-        # Without copyright
-        for path in report.files_without_copyright:
-            output.write(_("{path}: no copyright notice\n").format(path=path))
+        # Missing licenses, read errors, files without license or copyright.
+        subset_output = format_lines_subset(report)
 
-    return output.getvalue()
+    return output.getvalue() + subset_output
 
 
 def run(args: Namespace, project: Project, out: IO[str] = sys.stdout) -> int:

diff --git a/src/reuse/project.py b/src/reuse/project.py
@@ -18,7 +18,18 @@
 from collections import defaultdict
 from gettext import gettext as _
 from pathlib import Path
-from typing import DefaultDict, Dict, Iterator, List, NamedTuple, Optional, Type
+from typing import (
+    Collection,
+    DefaultDict,
+    Dict,
+    Iterator,
+    List,
+    NamedTuple,
+    Optional,
+    Set,
+    Type,
+    cast,
+)
 
 from binaryornot.check import is_binary
 
@@ -158,53 +169,19 @@ def from_directory(
 
         return project
 
-    def specific_files(
-        self, files: Optional[List], directory: Optional[StrPath] = None
+    def _iter_files(
+        self,
+        directory: Optional[StrPath] = None,
+        subset_files: Optional[Collection[StrPath]] = None,
     ) -> Iterator[Path]:
-        """Yield all files in the specified file list within a directory.
-
-        The files that are not yielded are:
-
-        - Files ignored by VCS (e.g., see .gitignore)
-
-        - Files matching IGNORE_*_PATTERNS.
-        """
-        if directory is None:
-            directory = self.root
-        directory = Path(directory)
-
-        if files is not None:
-            # Filter files.
-            for file_ in files:
-                the_file = directory / file_
-                if self._is_path_ignored(the_file):
-                    _LOGGER.debug("ignoring '%s'", the_file)
-                    continue
-                if the_file.is_symlink():
-                    _LOGGER.debug("skipping symlink '%s'", the_file)
-                    continue
-                # Suppressing this error because I simply don't want to deal
-                # with that here.
-                with contextlib.suppress(OSError):
-                    if the_file.stat().st_size == 0:
-                        _LOGGER.debug("skipping 0-sized file '%s'", the_file)
-                        continue
-
-                _LOGGER.debug("yielding '%s'", the_file)
-                yield the_file
-
-    def all_files(self, directory: Optional[StrPath] = None) -> Iterator[Path]:
-        """Yield all files in *directory* and its subdirectories.
-
-        The files that are not yielded are:
-
-        - Files ignored by VCS (e.g., see .gitignore)
-
-        - Files/directories matching IGNORE_*_PATTERNS.
-        """
+        # pylint: disable=too-many-branches
         if directory is None:
             directory = self.root
         directory = Path(directory)
+        if subset_files is not None:
+            subset_files = cast(
+                Set[Path], {Path(file_).resolve() for file_ in subset_files}
+            )
 
         for root_str, dirs, files in os.walk(directory):
             root = Path(root_str)
@@ -213,6 +190,11 @@ def all_files(self, directory: Optional[StrPath] = None) -> Iterator[Path]:
             # Don't walk ignored directories
             for dir_ in list(dirs):
                 the_dir = root / dir_
+                if subset_files is not None and not any(
+                    file_.is_relative_to(the_dir.resolve())
+                    for file_ in subset_files
+                ):
+                    continue
                 if self._is_path_ignored(the_dir):
                     _LOGGER.debug("ignoring '%s'", the_dir)
                     dirs.remove(dir_)
@@ -231,6 +213,11 @@ def all_files(self, directory: Optional[StrPath] = None) -> Iterator[Path]:
             # Filter files.
             for file_ in files:
                 the_file = root / file_
+                if (
+                    subset_files is not None
+                    and the_file.resolve() not in subset_files
+                ):
+                    continue
                 if self._is_path_ignored(the_file):
                     _LOGGER.debug("ignoring '%s'", the_file)
                     continue
@@ -247,6 +234,42 @@ def all_files(self, directory: Optional[StrPath] = None) -> Iterator[Path]:
                 _LOGGER.debug("yielding '%s'", the_file)
                 yield the_file
 
+    def all_files(self, directory: Optional[StrPath] = None) -> Iterator[Path]:
+        """Yield all files in *directory* and its subdirectories.
+
+        The files that are not yielded are those explicitly ignored by the REUSE
+        Specification. That means:
+
+        - LICENSE/COPYING files.
+        - VCS directories.
+        - .license files.
+        - .spdx files.
+        - Files ignored by VCS.
+        - Symlinks.
+        - Submodules (depending on the value of :attr:`include_submodules`).
+        - Meson subprojects (depending on the value of
+          :attr:`include_meson_subprojects`).
+        - 0-sized files.
+
+        Args:
+            directory: The directory in which to search.
+        """
+        return self._iter_files(directory=directory)
+
+    def subset_files(
+        self, files: Collection[StrPath], directory: Optional[StrPath] = None
+    ) -> Iterator[Path]:
+        """Like :meth:`all_files`, but all files that are not in *files* are
+        filtered out.
+
+        Args:
+            files: A collection of paths relative to the current working
+                directory. Any files that are not in this collection are not
+                yielded.
+            directory: The directory in which to search.
+        """
+        return self._iter_files(directory=directory, subset_files=files)
+
     def reuse_info_of(self, path: StrPath) -> List[ReuseInfo]:
         """Return REUSE info of *path*.