Skip to content

Commit

Permalink
add cluster_name label to cluster metrics (#61)
Browse files Browse the repository at this point in the history
  • Loading branch information
wildum committed Sep 13, 2024
1 parent e2e8e7d commit 0ee98ba
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 4 deletions.
33 changes: 31 additions & 2 deletions metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ const (
eventNodeConflict = "node_conflict"
)

// clusterNameLabel is the label key under which the cluster name is attached,
// as a constant label, to the cluster metrics defined in this file.
const clusterNameLabel = "cluster_name"

// metrics holds the set of metrics for a Node. Additional Collectors can be
// registered by calling Add.
type metrics struct {
Expand All @@ -33,38 +35,56 @@ type metrics struct {

var _ prometheus.Collector = (*metrics)(nil)

func newMetrics() *metrics {
func newMetrics(clusterName string) *metrics {
var m metrics

m.gossipEventsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "cluster_node_gossip_received_events_total",
Help: "Total number of gossip messages handled by the node.",
ConstLabels: prometheus.Labels{
clusterNameLabel: clusterName,
},
}, []string{"event"})

m.nodePeers = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "cluster_node_peers",
Help: "Current number of healthy peers by state",
ConstLabels: prometheus.Labels{
clusterNameLabel: clusterName,
},
}, []string{"state"})

m.nodeUpdating = prometheus.NewGauge(prometheus.GaugeOpts{
Name: "cluster_node_updating",
Help: "1 if the node is currently processing a change to the cluster state.",
ConstLabels: prometheus.Labels{
clusterNameLabel: clusterName,
},
})

m.nodeUpdateDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
Name: "cluster_node_update_duration_seconds",
Help: "Histogram of the latency it took to process a change to the cluster state.",
Buckets: prometheus.DefBuckets,
ConstLabels: prometheus.Labels{
clusterNameLabel: clusterName,
},
})

m.nodeObservers = prometheus.NewGauge(prometheus.GaugeOpts{
Name: "cluster_node_update_observers",
Help: "Number of internal observers waiting for changes to cluster state.",
ConstLabels: prometheus.Labels{
clusterNameLabel: clusterName,
},
})

m.nodeInfo = metricsutil.NewInfoCollector(metricsutil.InfoOpts{
Name: "cluster_node_info",
Help: "Info about the local node. Label values will change as the node changes state.",
ConstLabels: prometheus.Labels{
clusterNameLabel: clusterName,
},
}, "state")

m.Add(
Expand All @@ -79,12 +99,15 @@ func newMetrics() *metrics {
return &m
}

func newMemberlistCollector(ml *memberlist.Memberlist) prometheus.Collector {
func newMemberlistCollector(ml *memberlist.Memberlist, clusterName string) prometheus.Collector {
var container metricsutil.Container

gossipProtoVersion := prometheus.NewGaugeFunc(prometheus.GaugeOpts{
Name: "cluster_node_gossip_proto_version",
Help: "Gossip protocol version used by nodes to maintain the cluster",
ConstLabels: prometheus.Labels{
clusterNameLabel: clusterName,
},
}, func() float64 {
// NOTE(rfratto): while this is static at the time of writing, the internal
// documentation for memberlist claims that ProtocolVersion may one day be
Expand All @@ -95,13 +118,19 @@ func newMemberlistCollector(ml *memberlist.Memberlist) prometheus.Collector {
gossipHealthScore := prometheus.NewGaugeFunc(prometheus.GaugeOpts{
Name: "cluster_node_gossip_health_score",
Help: "Health value of a node; lower values means healthier. 0 is the minimum.",
ConstLabels: prometheus.Labels{
clusterNameLabel: clusterName,
},
}, func() float64 {
return float64(ml.GetHealthScore())
})

gossipPeers := prometheus.NewGaugeFunc(prometheus.GaugeOpts{
Name: "cluster_node_gossip_alive_peers",
Help: "How many alive gossip peers a node has, including the local node.",
ConstLabels: prometheus.Labels{
clusterNameLabel: clusterName,
},
}, func() float64 {
return float64(ml.NumMembers())
})
Expand Down
4 changes: 2 additions & 2 deletions node.go
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ func NewNode(cli *http.Client, cfg Config) (*Node, error) {
n := &Node{
log: cfg.Log,
cfg: cfg,
m: newMetrics(),
m: newMetrics(mlc.Label),

notifyObserversQueue: queue.New(1),

Expand All @@ -209,7 +209,7 @@ func NewNode(cli *http.Client, cfg Config) (*Node, error) {

// Include some extra metrics.
n.m.Add(
newMemberlistCollector(ml),
newMemberlistCollector(ml, mlc.Label),
transportMetrics,
prometheus.NewGaugeFunc(prometheus.GaugeOpts{
Name: "cluster_node_lamport_time",
Expand Down

0 comments on commit 0ee98ba

Please sign in to comment.