diff --git a/pkg/controllers/federatedhpa/federatedhpa_controller.go b/pkg/controllers/federatedhpa/federatedhpa_controller.go
index 5c0c1a79e0de..9cc03091f1de 100644
--- a/pkg/controllers/federatedhpa/federatedhpa_controller.go
+++ b/pkg/controllers/federatedhpa/federatedhpa_controller.go
@@ -35,6 +35,7 @@ import (
 	policyv1alpha1 "github.com/karmada-io/karmada/pkg/apis/policy/v1alpha1"
 	workv1alpha2 "github.com/karmada-io/karmada/pkg/apis/work/v1alpha2"
 	"github.com/karmada-io/karmada/pkg/controllers/federatedhpa/monitor"
+	"github.com/karmada-io/karmada/pkg/metrics"
 	"github.com/karmada-io/karmada/pkg/sharedcli/ratelimiterflag"
 	"github.com/karmada-io/karmada/pkg/util"
 	"github.com/karmada-io/karmada/pkg/util/fedinformer/typedmanager"
@@ -155,7 +156,15 @@ func (c *FederatedHPAController) Reconcile(ctx context.Context, req controllerru
 	}
 	c.hpaSelectorsMux.Unlock()
 
-	err := c.reconcileAutoscaler(ctx, hpa)
+	// Observe FederatedHPA processing latency. The closure reads err at return
+	// time; a direct deferred call would evaluate err immediately, while it is nil.
+	var err error
+	startTime := time.Now()
+	defer func() {
+		metrics.ObserveProcessFederatedHPALatency(err, startTime)
+	}()
+
+	err = c.reconcileAutoscaler(ctx, hpa)
 	if err != nil {
 		return controllerruntime.Result{}, err
 	}
diff --git a/pkg/controllers/federatedhpa/metrics/client.go b/pkg/controllers/federatedhpa/metrics/client.go
index a6b530b556a1..e4c043ec4d7d 100644
--- a/pkg/controllers/federatedhpa/metrics/client.go
+++ b/pkg/controllers/federatedhpa/metrics/client.go
@@ -32,6 +32,8 @@ import (
 	resourceclient "k8s.io/metrics/pkg/client/clientset/versioned/typed/metrics/v1beta1"
 	customclient "k8s.io/metrics/pkg/client/custom_metrics"
 	externalclient "k8s.io/metrics/pkg/client/external_metrics"
+
+	"github.com/karmada-io/karmada/pkg/metrics"
 )
 
 const (
@@ -64,6 +66,14 @@ type resourceMetricsClient struct {
 // GetResourceMetric gets the given resource metric (and an associated oldest timestamp)
 // for all pods matching the specified selector in the given namespace
 func (c *resourceMetricsClient) GetResourceMetric(ctx context.Context, resource corev1.ResourceName, namespace string, selector labels.Selector, container string) (PodMetricsInfo, time.Time, error) {
+	// Observe ResourceMetric pull latency. The closure reads err at return time;
+	// a direct deferred call would evaluate err immediately, while it is still nil.
+	var err error
+	startTime := time.Now()
+	defer func() {
+		metrics.ObserveFederatedHPAPullMetricsLatency(err, "ResourceMetric", startTime)
+	}()
+
 	metrics, err := c.client.PodMetricses(namespace).List(ctx, metav1.ListOptions{LabelSelector: selector.String()})
 	if err != nil {
 		return nil, time.Time{}, fmt.Errorf("unable to fetch metrics from resource metrics API: %v", err)
@@ -143,6 +153,14 @@ type customMetricsClient struct {
 // GetRawMetric gets the given metric (and an associated oldest timestamp)
 // for all pods matching the specified selector in the given namespace
 func (c *customMetricsClient) GetRawMetric(metricName string, namespace string, selector labels.Selector, metricSelector labels.Selector) (PodMetricsInfo, time.Time, error) {
+	// Observe RawMetric pull latency. The closure reads err at return time;
+	// a direct deferred call would evaluate err immediately, while it is still nil.
+	var err error
+	startTime := time.Now()
+	defer func() {
+		metrics.ObserveFederatedHPAPullMetricsLatency(err, "RawMetric", startTime)
+	}()
+
 	metrics, err := c.client.NamespacedMetrics(namespace).GetForObjects(schema.GroupKind{Kind: "Pod"}, selector, metricName, metricSelector)
 	if err != nil {
 		return nil, time.Time{}, fmt.Errorf("unable to fetch metrics from custom metrics API: %v", err)
@@ -175,9 +193,16 @@ func (c *customMetricsClient) GetRawMetric(metricName string, namespace string,
 // GetObjectMetric gets the given metric (and an associated timestamp) for the given
 // object in the given namespace
 func (c *customMetricsClient) GetObjectMetric(metricName string, namespace string, objectRef *autoscalingv2.CrossVersionObjectReference, metricSelector labels.Selector) (int64, time.Time, error) {
+	// Observe ObjectMetric pull latency. The closure reads err at return time;
+	// a direct deferred call would evaluate err immediately, while it is still nil.
+	var err error
+	startTime := time.Now()
+	defer func() {
+		metrics.ObserveFederatedHPAPullMetricsLatency(err, "ObjectMetric", startTime)
+	}()
+
 	gvk := schema.FromAPIVersionAndKind(objectRef.APIVersion, objectRef.Kind)
 	var metricValue *customapi.MetricValue
-	var err error
 	if gvk.Kind == "Namespace" && gvk.Group == "" {
 		// handle namespace separately
 		// NB: we ignore namespace name here, since CrossVersionObjectReference isn't
@@ -203,6 +228,14 @@ type externalMetricsClient struct {
 // GetExternalMetric gets all the values of a given external metric
 // that match the specified selector.
 func (c *externalMetricsClient) GetExternalMetric(metricName, namespace string, selector labels.Selector) ([]int64, time.Time, error) {
+	// Observe ExternalMetric pull latency. The closure reads err at return time;
+	// a direct deferred call would evaluate err immediately, while it is still nil.
+	var err error
+	startTime := time.Now()
+	defer func() {
+		metrics.ObserveFederatedHPAPullMetricsLatency(err, "ExternalMetric", startTime)
+	}()
+
 	metrics, err := c.client.NamespacedMetrics(namespace).List(metricName, selector)
 	if err != nil {
 		return []int64{}, time.Time{}, fmt.Errorf("unable to fetch metrics from external metrics API: %v", err)
diff --git a/pkg/metrics/resource.go b/pkg/metrics/resource.go
index 3555f019492d..9d15344cbf83 100644
--- a/pkg/metrics/resource.go
+++ b/pkg/metrics/resource.go
@@ -9,14 +9,16 @@ import (
 )
 
 const (
-	resourceMatchPolicyDurationMetricsName  = "resource_match_policy_duration_seconds"
-	resourceApplyPolicyDurationMetricsName  = "resource_apply_policy_duration_seconds"
-	policyApplyAttemptsMetricsName          = "policy_apply_attempts_total"
-	syncWorkDurationMetricsName             = "binding_sync_work_duration_seconds"
-	syncWorkloadDurationMetricsName         = "work_sync_workload_duration_seconds"
-	policyPreemptionMetricsName             = "policy_preemption_total"
-	cronFederatedHPADurationMetricsName     = "cronfederatedhpa_process_duration_seconds"
-	cronFederatedHPARuleDurationMetricsName = "cronfederatedhpa_rule_process_duration_seconds"
+	resourceMatchPolicyDurationMetricsName     = "resource_match_policy_duration_seconds"
+	resourceApplyPolicyDurationMetricsName     = "resource_apply_policy_duration_seconds"
+	policyApplyAttemptsMetricsName             = "policy_apply_attempts_total"
+	syncWorkDurationMetricsName                = "binding_sync_work_duration_seconds"
+	syncWorkloadDurationMetricsName            = "work_sync_workload_duration_seconds"
+	policyPreemptionMetricsName                = "policy_preemption_total"
+	cronFederatedHPADurationMetricsName        = "cronfederatedhpa_process_duration_seconds"
+	cronFederatedHPARuleDurationMetricsName    = "cronfederatedhpa_rule_process_duration_seconds"
+	federatedHPADurationMetricsName            = "federatedhpa_process_duration_seconds"
+	federatedHPAPullMetricsDurationMetricsName = "federatedhpa_pull_metrics_duration_seconds"
 )
 
 var (
@@ -65,6 +67,18 @@ var (
 		Help:    "Duration in seconds to process a CronFederatedHPA rule. By the result, 'error' means a CronFederatedHPA rule failed to be processed. Otherwise 'success'.",
 		Buckets: prometheus.ExponentialBuckets(0.001, 2, 12),
 	}, []string{"result"})
+
+	federatedHPADurationHistogram = prometheus.NewHistogramVec(prometheus.HistogramOpts{
+		Name:    federatedHPADurationMetricsName,
+		Help:    "Duration in seconds to process a FederatedHPA. By the result, 'error' means a FederatedHPA failed to be processed. Otherwise 'success'.",
+		Buckets: prometheus.ExponentialBuckets(0.01, 2, 12),
+	}, []string{"result"})
+
+	federatedHPAPullMetricsDurationHistogram = prometheus.NewHistogramVec(prometheus.HistogramOpts{
+		Name:    federatedHPAPullMetricsDurationMetricsName,
+		Help:    "Duration in seconds taken by the FederatedHPA to pull metrics. By the result, 'error' means the FederatedHPA failed to pull the metrics. Otherwise 'success'.",
+		Buckets: prometheus.ExponentialBuckets(0.01, 2, 12),
+	}, []string{"result", "metricType"})
 )
 
 // ObserveFindMatchedPolicyLatency records the duration for the resource finding a matched policy.
@@ -103,6 +117,16 @@ func ObserveProcessCronFederatedHPARuleLatency(err error, start time.Time) {
 	cronFederatedHPARuleDurationHistogram.WithLabelValues(utilmetrics.GetResultByError(err)).Observe(utilmetrics.DurationInSeconds(start))
 }
 
+// ObserveProcessFederatedHPALatency records the duration to process a FederatedHPA.
+func ObserveProcessFederatedHPALatency(err error, start time.Time) {
+	federatedHPADurationHistogram.WithLabelValues(utilmetrics.GetResultByError(err)).Observe(utilmetrics.DurationInSeconds(start))
+}
+
+// ObserveFederatedHPAPullMetricsLatency records the duration it takes for the FederatedHPA to pull metrics.
+func ObserveFederatedHPAPullMetricsLatency(err error, metricType string, start time.Time) {
+	federatedHPAPullMetricsDurationHistogram.WithLabelValues(utilmetrics.GetResultByError(err), metricType).Observe(utilmetrics.DurationInSeconds(start))
+}
+
 // ResourceCollectors returns the collectors about resources.
 func ResourceCollectors() []prometheus.Collector {
 	return []prometheus.Collector{
@@ -114,5 +138,7 @@ func ResourceCollectors() []prometheus.Collector {
 		policyPreemptionCounter,
 		cronFederatedHPADurationHistogram,
 		cronFederatedHPARuleDurationHistogram,
+		federatedHPADurationHistogram,
+		federatedHPAPullMetricsDurationHistogram,
 	}
 }