Add a metric for invalid ZKAP attempts #265

Closed
wants to merge 48 commits into from
3f374bb
Metrics: Add test for metric file writing scaffolding
exarkun Nov 19, 2021
8377166
Metrics: Implement regular writing of metrics to .prom file
exarkun Nov 19, 2021
4b31b56
Give ZKAPAuthorizerStorageServer a default registry so tests work again
exarkun Nov 22, 2021
326ff62
Add a counter of valid spent passes on the storage server
exarkun Nov 22, 2021
c788f60
just use integer number of seconds in configuration
exarkun Dec 1, 2021
b89e84d
Replace the ZKAP spending counter with a histogram
exarkun Dec 1, 2021
6e24ebe
factor metric reading helpers out
exarkun Dec 1, 2021
8ecfcb6
[wip]
exarkun Dec 1, 2021
457b269
test a boolean, for clarity
exarkun Dec 6, 2021
d791deb
Add helpers for computing and observing spending metrics
exarkun Dec 6, 2021
8938664
add missing argument to after_bucket
exarkun Dec 6, 2021
d004d91
add metric for add_lease operation
exarkun Dec 6, 2021
ce47c17
pyflakes
exarkun Dec 6, 2021
31b1518
black
exarkun Dec 6, 2021
1481e8a
isort
exarkun Dec 6, 2021
0a562ef
link to dependency PR that helps
exarkun Dec 6, 2021
75dfc52
Merge remote-tracking branch 'origin/main' into 141.serverside-promet…
exarkun Dec 6, 2021
3beda67
fix merge lints
exarkun Dec 6, 2021
5dd2668
Merge remote-tracking branch 'origin/main' into 141.serverside-promet…
exarkun Dec 6, 2021
721d24e
use observe_spending_successes
exarkun Dec 6, 2021
e0e8bfb
simplify successful spending metrics
exarkun Dec 6, 2021
a9e8702
somewhat simplify tests by clearing metrics before each example
exarkun Dec 6, 2021
30632e7
observe immutable spending success only after success
exarkun Dec 6, 2021
3fcced5
black
exarkun Dec 6, 2021
a204243
make sure we put an integer in the config
exarkun Dec 6, 2021
df257e8
fix prometheus on windows, maybe
exarkun Dec 7, 2021
9610054
black
exarkun Dec 7, 2021
09ef63c
do not let write_to_textfile raise an exception to LoopingCall
exarkun Dec 7, 2021
58eaeb2
add missing type annotation import
exarkun Dec 7, 2021
56476d1
get the right eliot ...
exarkun Dec 7, 2021
e3b858e
return the safe writer!
exarkun Dec 7, 2021
9b0dd25
go away, Windows
exarkun Dec 7, 2021
d82223b
Use the attr.ib default decorator
exarkun Dec 10, 2021
a3b200a
explain what we're working around a little bit more
exarkun Dec 10, 2021
4d4b214
link to the issue where there's some discussion about clear()
exarkun Dec 10, 2021
3ab1b2f
rename `_get_buckets` to something reflecting metricsyness
exarkun Dec 10, 2021
77296ac
note about why we only have this assertion
exarkun Dec 10, 2021
8821dd0
don't record allocate_buckets metrics before success
exarkun Dec 13, 2021
23d2f18
verify add_lease metric is recorded after we check pass counts
exarkun Dec 13, 2021
85d05c6
only record add_lease successful spending if it is successful
exarkun Dec 13, 2021
f2a5049
Merge remote-tracking branch 'origin/main' into 141.serverside-promet…
exarkun Dec 13, 2021
e7be7c6
only record mutable pass spending if it is successful
exarkun Dec 13, 2021
ee4b735
give read_count and read_bucket better names
exarkun Dec 13, 2021
405fe58
link to upstream issue about better testing APIs
exarkun Dec 13, 2021
2938889
do structured matching in the metrics writer test
exarkun Dec 13, 2021
378cf91
remove unused import
exarkun Dec 13, 2021
1617ffa
rejected zkap metric
exarkun Dec 6, 2021
a509c28
docstring and type signature woo
exarkun Dec 6, 2021
1 change: 1 addition & 0 deletions setup.cfg
@@ -48,6 +48,7 @@ install_requires =
tahoe-lafs @ https://github.com/tahoe-lafs/tahoe-lafs/archive/d3c6f58a8ded7db3324ef97c47f5c1921c3d58b7.zip
treq
pyutil
prometheus-client

[options.extras_require]
test = coverage; fixtures; testtools; hypothesis
49 changes: 48 additions & 1 deletion src/_zkapauthorizer/_plugin.py
@@ -17,16 +17,26 @@
Tahoe-LAFS.
"""

from __future__ import absolute_import

import random
from datetime import datetime
from functools import partial
from weakref import WeakValueDictionary

try:
from typing import Callable
except ImportError:
pass

import attr
from allmydata.client import _Client
from allmydata.interfaces import IAnnounceableStorageServer, IFoolscapStoragePlugin
from allmydata.node import MissingConfigEntry
from challenge_bypass_ristretto import SigningKey
from eliot import start_action
from prometheus_client import CollectorRegistry, write_to_textfile
from twisted.internet import task
from twisted.internet.defer import succeed
from twisted.logger import Logger
from twisted.python.filepath import FilePath
@@ -95,8 +105,23 @@ def _get_redeemer(self, node_config, announcement, reactor):
"""
return get_redeemer(self.name, node_config, announcement, reactor)

def get_storage_server(self, configuration, get_anonymous_storage_server):
def get_storage_server(
self, configuration, get_anonymous_storage_server, reactor=None
):
if reactor is None:
from twisted.internet import reactor
registry = CollectorRegistry()
kwargs = configuration.copy()

# If metrics are desired, schedule their writing to disk.
metrics_interval = kwargs.pop(u"prometheus-metrics-interval", None)
metrics_path = kwargs.pop(u"prometheus-metrics-path", None)
if metrics_interval is not None and metrics_path is not None:
FilePath(metrics_path).parent().makedirs(ignoreExistingDirectory=True)
t = task.LoopingCall(make_safe_writer(metrics_path, registry))
t.clock = reactor
t.start(int(metrics_interval))

root_url = kwargs.pop(u"ristretto-issuer-root-url")
pass_value = int(kwargs.pop(u"pass-value", BYTES_PER_PASS))
signing_key = load_signing_key(
@@ -111,6 +136,7 @@ def get_storage_server(self, configuration, get_anonymous_storage_server):
get_anonymous_storage_server(),
pass_value=pass_value,
signing_key=signing_key,
registry=registry,
**kwargs
)
return succeed(
@@ -158,6 +184,27 @@ def get_client_resource(self, node_config, reactor=None):
)


def make_safe_writer(metrics_path, registry):
# type: (str, CollectorRegistry) -> Callable[[], None]
"""
Make a no-argument callable that writes metrics from the given registry to
the given path. The callable will log errors writing to the path and not
raise exceptions.
"""

def safe_writer():
try:
with start_action(
action_type=u"zkapauthorizer:metrics:write-to-textfile",
metrics_path=metrics_path,
):
write_to_textfile(metrics_path, registry)
except Exception:
pass

return safe_writer


_init_storage = _Client.__dict__["init_storage"]


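The `make_safe_writer` helper added above exists so that an exception from `write_to_textfile` never kills the `LoopingCall` that invokes it. The same pattern can be sketched with only the standard library; `collect_metrics` below is a hypothetical stand-in for prometheus_client's registry collection, and the atomic rename is an extra precaution not present in the PR itself:

```python
import logging
import os
import tempfile


def make_safe_writer(metrics_path, collect_metrics):
    """Return a no-argument callable that writes metrics text to
    metrics_path, logging any failure instead of raising, so that a
    periodic scheduler calling it keeps running."""

    def safe_writer():
        try:
            # Write to a temporary file in the same directory and rename
            # it into place so a scraper never reads a half-written file.
            directory = os.path.dirname(metrics_path) or "."
            fd, tmp = tempfile.mkstemp(dir=directory)
            try:
                with os.fdopen(fd, "w") as f:
                    f.write(collect_metrics())
                os.replace(tmp, metrics_path)
            except BaseException:
                os.unlink(tmp)
                raise
        except Exception:
            logging.exception("failed to write metrics to %r", metrics_path)

    return safe_writer
```

Calling the returned writer with a failing collector simply logs; the previously written file is left untouched.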
129 changes: 125 additions & 4 deletions src/_zkapauthorizer/_storage_server.py
@@ -41,6 +41,7 @@
from challenge_bypass_ristretto import SigningKey, TokenPreimage, VerificationSignature
from eliot import log_call, start_action
from foolscap.api import Referenceable
from prometheus_client import CollectorRegistry, Histogram, Counter
from twisted.internet.defer import Deferred
from twisted.internet.interfaces import IReactorTime
from twisted.python.filepath import FilePath
@@ -64,6 +65,19 @@
except ImportError:
pass

# The last Python 2-supporting prometheus_client nevertheless tries to use
# FileNotFoundError, an exception type from Python 3. Since that release,
# prometheus_client has dropped Python 2 support entirely so there is little
# hope of ever having this fixed upstream. When ZKAPAuthorizer is ported to
# Python 3, this should no longer be necessary.
def _prometheus_client_fix():
import prometheus_client.exposition

prometheus_client.exposition.FileNotFoundError = IOError


_prometheus_client_fix()

# See allmydata/storage/mutable.py
SLOT_HEADER_SIZE = 468
LEASE_TRAILER_SIZE = 4
@@ -135,6 +149,15 @@ def validate_passes(cls, message, passes, signing_key):
signature_check_failed=signature_check_failed,
)

def observe_error_metrics(self, error_metric):
# type: (Counter) -> None
"""
Record any errors on the given metric object.
"""
num_signature_errors = len(self.signature_check_failed)
if num_signature_errors > 0:
error_metric.labels("signature").inc(1)

def raise_for(self, required_pass_count):
"""
:raise MorePassesRequired: Always raised with fields populated from this
@@ -177,10 +200,74 @@ class ZKAPAuthorizerStorageServer(Referenceable):
_original = attr.ib(validator=provides(RIStorageServer))
_pass_value = pass_value_attribute()
_signing_key = attr.ib(validator=instance_of(SigningKey))
_registry = attr.ib(
default=attr.Factory(CollectorRegistry),
validator=attr.validators.instance_of(CollectorRegistry),
)
_clock = attr.ib(
validator=provides(IReactorTime),
default=attr.Factory(partial(namedAny, "twisted.internet.reactor")),
)
# This histogram holds observations about the number of ZKAPs spent
# together on one operation. Only ZKAPs for operations that succeed are
# accounted for. For example, if two immutable shares are uploaded
# together at a cost of 5 ZKAPs then the "5 ZKAPs" bucket observes one
# sample.
_metric_spending_successes = attr.ib(init=False)

# This counter holds observations about spending attempts that included
# ZKAPs without an acceptable signature. For each spending attempt that
# includes any such ZKAPs, this counter is incremented.
_metric_spending_errors = attr.ib(init=False)

def _get_spending_histogram_buckets(self):
"""
Create the upper bounds for the ZKAP spending histogram.
"""
# We want a lot of small buckets to be able to get an idea of how much
# spending is for tiny files where our billing system doesn't work
# extremely well. We also want some large buckets so we have a point
# of comparison - is there a lot more or less spending on big files
# than small files? Prometheus recommends a metric have a maximum
# cardinality below 10
# (<https://prometheus.io/docs/practices/instrumentation/#do-not-overuse-labels>).
# Histograms are implemented with labels so the cardinality is equal
# to the number of buckets. We will push this a little bit so we can
# span a better range. The good news is that this is a static
# cardinality (it does not change based on the data observed) so we
# are not at risk of blowing up the metrics overhead unboundedly. 11
# finite buckets + 1 infinite bucket covers 1 to 1024 ZKAPs (plus
# infinity) and only needs 12 buckets.
return list(2 ** n for n in range(11)) + [float("inf")]

@_metric_spending_successes.default
def _make_success_histogram(self):
return Histogram(
"zkapauthorizer_server_spending_successes",
"ZKAP Spending Successes histogram",
registry=self._registry,
buckets=self._get_spending_histogram_buckets(),
)

@_metric_spending_errors.default
def _make_error_metric(self):
return Counter(
"zkapauthorizer_server_spending_errors",
"ZKAP Spending Errors",
labelnames=["signature"],
registry=self._registry,
)

def _clear_metrics(self):
"""
Forget all recorded metrics.
"""
# There is also a `clear` method but it is for something else. See
# https://github.com/prometheus/client_python/issues/707
self._metric_spending_successes._metric_init()

# It works on this one though.
self._metric_spending_errors.clear()

def remote_get_version(self):
"""
@@ -209,6 +296,9 @@ def remote_allocate_buckets(
self._signing_key,
)

# Observe error metrics before blowing up the operation.
validation.observe_error_metrics(self._metric_spending_errors)

# Note: The *allocate_buckets* protocol allows for some shares to
# already exist on the server. When this is the case, the cost of the
# operation is based only on the shares which are really allocated
@@ -244,6 +334,22 @@ def remote_allocate_buckets(
allocated_size,
renew_leases=False,
)

# We just committed to spending some of the presented passes. If
# `alreadygot` is not empty then we didn't commit to spending *all* of
# them. (Also, we didn't *accept* data for storage yet - but that's a
# defect in the spending protocol and metrics can't fix it so just
# ignore that for now.)
#
# This expression mirrors the expression the client uses to determine
# how many passes were spent when it processes the result we return to
# it.
spent_passes = required_passes(
self._pass_value,
[allocated_size] * len(bucketwriters),
)
self._metric_spending_successes.observe(spent_passes)

# Copy/paste the disconnection handling logic from
# StorageServer.remote_allocate_buckets.
for bw in bucketwriters.values():
@@ -252,6 +358,7 @@ def remote_allocate_buckets(
canary,
disconnect_marker,
)

return alreadygot, bucketwriters

def remote_get_buckets(self, storage_index):
@@ -271,13 +378,18 @@ def remote_add_lease(self, passes, storage_index, *a, **kw):
passes,
self._signing_key,
)
# Observe error metrics before blowing up the operation.
validation.observe_error_metrics(self._metric_spending_errors)

check_pass_quantity_for_lease(
self._pass_value,
storage_index,
validation,
self._original,
)
return self._original.remote_add_lease(storage_index, *a, **kw)
result = self._original.remote_add_lease(storage_index, *a, **kw)
self._metric_spending_successes.observe(len(validation.valid))
return result

def remote_advise_corrupt_share(self, *a, **kw):
"""
@@ -367,6 +479,9 @@ def _slot_testv_and_readv_and_writev(
self._signing_key,
)

# Observe error metrics before blowing up the operation.
validation.observe_error_metrics(self._metric_spending_errors)

# Inspect the operation to determine its price based on any
# allocations.
required_new_passes = get_writev_price(
@@ -406,6 +521,9 @@ def _slot_testv_and_readv_and_writev(
# somewhat.
add_leases_for_writev(self._original, storage_index, secrets, tw_vectors, now)

# The operation has fully succeeded.
self._metric_spending_successes.observe(required_new_passes)

# Propagate the result of the operation.
return result

@@ -442,6 +560,7 @@ def check_pass_quantity(pass_value, validation, share_sizes):
def check_pass_quantity_for_lease(
pass_value, storage_index, validation, storage_server
):
# type: (int, bytes, _ValidationResult, ZKAPAuthorizerStorageServer) -> Dict[int, int]
"""
Check that the given number of passes is sufficient to add or renew a
lease for one period for the given storage index.
@@ -453,16 +572,18 @@ def check_pass_quantity_for_lease(
:raise MorePassesRequired: If the given number of passes is too few for
the share sizes at the given storage index.

:return: ``None`` if the given number of passes is sufficient.
:return: A mapping from share number to share size on the server if the
number of passes given is sufficient.
"""
allocated_sizes = dict(
get_share_sizes(
storage_server,
storage_index,
list(get_all_share_numbers(storage_server, storage_index)),
),
).values()
check_pass_quantity(pass_value, validation, allocated_sizes)
)
check_pass_quantity(pass_value, validation, allocated_sizes.values())
return allocated_sizes


def check_pass_quantity_for_write(pass_value, validation, sharenums, allocated_size):
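The bucket scheme `_get_spending_histogram_buckets` describes — powers of two from 1 to 1024 plus the implicit `+Inf` bucket, for a fixed cardinality of 12 — can be illustrated standalone. `bucket_index` below is not part of the PR; it is a hypothetical helper showing which bucket an observation falls under with Prometheus's cumulative `le` semantics:

```python
import bisect

# 11 finite powers-of-two upper bounds (1, 2, 4, ..., 1024) plus the
# +Inf bucket: 12 buckets total, a static cardinality (independent of
# the observed data) close to Prometheus's fewer-than-10 guidance.
SPENDING_BUCKETS = [2 ** n for n in range(11)] + [float("inf")]


def bucket_index(zkaps_spent):
    """Index of the smallest upper bound the observation falls under,
    mirroring the cumulative `le` semantics of Prometheus histograms."""
    return bisect.bisect_left(SPENDING_BUCKETS, zkaps_spent)
```

For example, an operation spending 5 ZKAPs lands in the bucket with upper bound 8, and anything above 1024 lands in `+Inf`.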
2 changes: 1 addition & 1 deletion src/_zkapauthorizer/storage_common.py
@@ -155,7 +155,7 @@ def required_passes(bytes_per_pass, share_sizes):
),
)
result, b = divmod(sum(share_sizes, 0), bytes_per_pass)
if b:
if b > 0:
result += 1

# print("required_passes({}, {}) == {}".format(bytes_per_pass, share_sizes, result))
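The `if b:` to `if b > 0:` change above is a clarity tweak to the pricing arithmetic: `required_passes` is a ceiling division of the total share bytes by the per-pass value. A minimal sketch of that computation, reduced to just the divmod logic (the PR's full version also carries a type check not shown here):

```python
def required_passes(bytes_per_pass, share_sizes):
    """Number of passes needed to pay for the given share sizes:
    ceiling division of the total byte count by the per-pass value."""
    result, remainder = divmod(sum(share_sizes, 0), bytes_per_pass)
    if remainder > 0:
        # A partial pass-worth of bytes still costs one whole pass.
        result += 1
    return result
```

This is the same expression the server uses to compute `spent_passes` in `remote_allocate_buckets`, which is why the success histogram mirrors the client's own accounting.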