From 92aee19aed0500893d48c7140d68e9c83e0be0ab Mon Sep 17 00:00:00 2001 From: Nils Homer Date: Wed, 14 Aug 2024 11:22:57 -0700 Subject: [PATCH] feat: support metric custom column ordering and sub-setting Fixes #177 --- fgpyo/util/inspect.py | 2 +- fgpyo/util/metric.py | 77 ++++++++++++++++++++++++++++----- tests/fgpyo/util/test_metric.py | 73 +++++++++++++++++++++++++++++++ 3 files changed, 140 insertions(+), 12 deletions(-) diff --git a/fgpyo/util/inspect.py b/fgpyo/util/inspect.py index a2f655b..ab002a1 100644 --- a/fgpyo/util/inspect.py +++ b/fgpyo/util/inspect.py @@ -332,7 +332,7 @@ def get_parser() -> partial: # noqa: C901 nonlocal type_ nonlocal parsers try: - return functools.partial(parsers[type_]) + return functools.partial(parsers[type_]) # type: ignore[misc] except KeyError as ex: if ( type_ in [str, int, float] diff --git a/fgpyo/util/metric.py b/fgpyo/util/metric.py index f1b5d49..aa491ee 100644 --- a/fgpyo/util/metric.py +++ b/fgpyo/util/metric.py @@ -111,6 +111,32 @@ >>> Person(name=Name(first='john', last='doe'), age=42, address=None).formatted_values() ["first last", "42"] ``` + +Re-ordering and sub-setting the columns when writing is supported by overriding the `_header()` +method. In the example below, the `weight` field is not written, but is optional to support reading +the metric back in. Also, the `name` and `age` columns are written in reverse order. + +```python + >>> from fgpyo.util.metric import Metric + >>> import attr + >>> @attr.s(auto_attribs=True, frozen=True) + ... class Person(Metric["Person"]): + ... name: str + ... age: int + ... weight: Optional[int] + ... @classmethod + ... def _header(cls, field_types: Optional[List[FieldType]] = None) -> List[str]: + ... return ["age", "name"] + >>> person = Person(name="john", age=42, weight=180) + >>> person.header() + ["age", "name"] + >>> list(person.values()) + [42, "john"] + >>> Person.write(Path("/path/to/metrics.txt"), person) + >>> list(Person.read(Path("/path/to/metrics.txt"))) + [Person(name="john", age=42, weight=None)] +``` + """ from abc import ABC @@ -122,10 +148,13 @@ from typing import Generic from typing import Iterator from typing import List +from typing import Optional from typing import TypeVar +from typing import final from fgpyo import io from fgpyo.util import inspect +from fgpyo.util.inspect import FieldType MetricType = TypeVar("MetricType", bound="Metric") @@ -141,19 +170,21 @@ class Metric(ABC, Generic[MetricType]): [`format_value()`][fgpyo.util.metric.Metric.format_value]. """ - def values(self) -> Iterator[Any]: + def values(self, _header: Optional[List[str]] = None) -> Iterator[Any]: """An iterator over attribute values in the same order as the header.""" - for field in inspect.get_fields(self.__class__): # type: ignore[arg-type] - yield getattr(self, field.name) + if _header is None: + _header = self.header() + for name in _header: + yield getattr(self, name) - def formatted_values(self) -> List[str]: + def formatted_values(self, _header: Optional[List[str]] = None) -> List[str]: """An iterator over formatted attribute values in the same order as the header.""" - return [self.format_value(value) for value in self.values()] + return [self.format_value(value) for value in self.values(_header=_header)] @classmethod def _parsers(cls) -> Dict[type, Callable[[str], Any]]: """Mapping of type to a specific parser for that type. The parser must accept a string - as a single parameter and return a single value of the given type. Sub-classes may + as a single parameter and return a single value of the given type. Subclasses may override this method to support custom types.""" return {} @@ -245,21 +276,45 @@ def write(cls, path: Path, *values: MetricType) -> None: Args: path: Path to the output file. values: Zero or more metrics. - """ + header = cls.header() with io.to_writer(path) as writer: writer.write("\t".join(cls.header())) writer.write("\n") for value in values: # Important, don't recurse on nested attr classes, instead let implementing class # implement format_value. - writer.write("\t".join(cls.format_value(item) for item in value.values())) + writer.write( + "\t".join(cls.format_value(item) for item in value.values(_header=header)) + ) writer.write("\n") @classmethod + @final def header(cls) -> List[str]: - """The list of header values for the metric.""" - return [a.name for a in inspect.get_fields(cls)] # type: ignore[arg-type] + """The list of header values for the metric. + + This method may be overridden to re-order or subset the columns written to file with + `write()` or returned by `values()`. + """ + field_types = list(inspect.get_fields(cls)) # type: ignore[arg-type] + field_names = {field.name for field in field_types} + header = cls._header(field_types=field_types) + extra_fields = [h for h in header if h not in field_names] + if len(extra_fields) > 0: + raise ValueError("header() returned extra fields: " + ", ".join(extra_fields)) + return header + + @classmethod + def _header(cls, field_types: Optional[List[FieldType]] = None) -> List[str]: + """Returns a mapping of a field to the index it should be returned in `values()` + based on the field names returned by `header()`. + + This is useful for re-ordering or sub-setting the output columns in `write()`. + """ + if field_types is None: + field_types = list(inspect.get_fields(cls)) # type: ignore[arg-type] + return [a.name for a in field_types] @classmethod def format_value(cls, value: Any) -> str: # noqa: C901 @@ -315,7 +370,7 @@ def format_value(cls, value: Any) -> str: # noqa: C901 @classmethod def to_list(cls, value: str) -> List[Any]: - """Returns a list value split on comma delimeter.""" + """Returns a list value split on comma delimiter.""" return [] if value == "" else value.split(",") @staticmethod diff --git a/tests/fgpyo/util/test_metric.py b/tests/fgpyo/util/test_metric.py index 40d19c1..a07898c 100644 --- a/tests/fgpyo/util/test_metric.py +++ b/tests/fgpyo/util/test_metric.py @@ -26,6 +26,7 @@ import attr import pytest +from fgpyo.util.inspect import FieldType from fgpyo.util.inspect import is_attr_class from fgpyo.util.inspect import is_dataclasses_class from fgpyo.util.metric import Metric @@ -121,6 +122,24 @@ class NameMetric(Metric["NameMetric"]): first: str last: str + @make_dataclass(use_attr=use_attr) + class LastFirstMetric(Metric["LastFirstMetric"]): + first: str + last: str + + @classmethod + def _header(cls, field_types: Optional[List[FieldType]] = None) -> List[str]: + return ["last", "first"] + + @make_dataclass(use_attr=use_attr) + class SubsetMetric(Metric["SubsetMetric"]): + written: str + hidden: Optional[str] + + @classmethod + def _header(cls, field_types: Optional[List[FieldType]] = None) -> List[str]: + return ["written"] + @make_dataclass(use_attr=use_attr) class NamedPerson(Metric["NamedPerson"]): name: Name @@ -156,6 +175,8 @@ class ListPerson(Metric["ListPerson"]): self.Person = Person self.Name = Name self.NameMetric = NameMetric + self.LastFirstMetric = LastFirstMetric + self.SubsetMetric = SubsetMetric self.NamedPerson = NamedPerson self.PersonMaybeAge = PersonMaybeAge self.PersonDefault = PersonDefault @@ -234,6 +255,10 @@ def test_is_correct_dataclass_type(use_attr: bool) -> None: assert is_dataclasses_class(data_and_classes.Name) is not use_attr assert is_attr_class(data_and_classes.NameMetric) is use_attr assert is_dataclasses_class(data_and_classes.NameMetric) is not use_attr + assert is_attr_class(data_and_classes.LastFirstMetric) is use_attr + assert is_dataclasses_class(data_and_classes.LastFirstMetric) is not use_attr + assert is_attr_class(data_and_classes.SubsetMetric) is use_attr + assert is_dataclasses_class(data_and_classes.SubsetMetric) is not use_attr assert is_attr_class(data_and_classes.NamedPerson) is use_attr assert is_dataclasses_class(data_and_classes.NamedPerson) is not use_attr assert is_attr_class(data_and_classes.PersonMaybeAge) is use_attr @@ -368,6 +393,54 @@ def test_metric_read_missing_column_with_default( list(PersonDefault.read(path=path)) +@pytest.mark.parametrize("data_and_classes", (attr_data_and_classes, dataclasses_data_and_classes)) +def test_metric_read_different_column_order(tmp_path: Path, data_and_classes: DataBuilder) -> None: + NameMetric: TypeAlias = data_and_classes.NameMetric + name = NameMetric(first="Jane", last="Doe") + path = tmp_path / "metrics.txt" + + with path.open("w") as writer: + writer.write("last\tfirst\n") + writer.write("Doe\tJane\n") + assert list(NameMetric.read(path=path)) == [name] + + +@pytest.mark.parametrize("data_and_classes", (attr_data_and_classes, dataclasses_data_and_classes)) +def test_metric_write_different_column_order(tmp_path: Path, data_and_classes: DataBuilder) -> None: + LastFirstMetric: TypeAlias = data_and_classes.LastFirstMetric + name = LastFirstMetric(first="Jane", last="Doe") + + assert LastFirstMetric.header() == ["last", "first"] + assert list(name.values()) == ["Doe", "Jane"] + + path = tmp_path / "metrics.txt" + LastFirstMetric.write(path, name) + with path.open("r") as read: + header = read.readline().strip().split("\t") + fields = read.readline().strip().split("\t") + + assert header == ["last", "first"] + assert fields == [name.last, name.first] + + +@pytest.mark.parametrize("data_and_classes", (attr_data_and_classes, dataclasses_data_and_classes)) +def test_metric_write_subset(tmp_path: Path, data_and_classes: DataBuilder) -> None: + SubsetMetric: TypeAlias = data_and_classes.SubsetMetric + metric = SubsetMetric(written="present", hidden="absent") + + assert SubsetMetric.header() == ["written"] + assert list(metric.values()) == ["present"] + + path = tmp_path / "metrics.txt" + SubsetMetric.write(path, metric) + with path.open("r") as read: + header = read.readline().strip().split("\t") + fields = read.readline().strip().split("\t") + + assert header == ["written"] + assert fields == [metric.written] + + @pytest.mark.parametrize("data_and_classes", (attr_data_and_classes, dataclasses_data_and_classes)) def test_metric_header(data_and_classes: DataBuilder) -> None: assert data_and_classes.DummyMetric.header() == [