Merge pull request #6224 from Roasbeef/prometheus-latency-histograms

monitoring+cfg: add new option to export gRPC perf metrics
2025-01-18 05:13:36 +01:00 · 2022-02-16 16:38:51 -08:00 · 2022-02-16 16:38:51 -08:00 · 9a50cc95b7
commit 9a50cc95b7
parent 0aa0831619 1fd8a78537
4 changed files with 26 additions and 0 deletions
--- a/docs/release-notes/release-notes-0.15.0.md
+++ b/docs/release-notes/release-notes-0.15.0.md
@ -82,6 +82,11 @@
 * Improved instructions on [how to build lnd for mobile](https://github.com/lightningnetwork/lnd/pull/6085).
 * [Log force-close related messages on "info" level](https://github.com/lightningnetwork/lnd/pull/6124).

+## Monitoring
+
+A new [flag (`--prometheus.perfhistograms`) has been added to enable export of
+gRPC performance metrics (latency to process `GetInfo`, etc)](https://github.com/lightningnetwork/lnd/pull/6224).
+
 ## Code Health

 ### Code cleanup, refactor, typo fixes
@ -127,6 +132,7 @@
 * Liviu
 * mateuszmp
 * Naveen Srinivasan
+* Olaoluwa Osuntokun
 * randymcmillan
 * Rong Ou
 * Thebora Kompanioni
--- a/lncfg/monitoring_on.go
+++ b/lncfg/monitoring_on.go
@ -13,6 +13,12 @@ type Prometheus struct {
 	// Enable indicates whether to export lnd gRPC performance metrics to
 	// Prometheus. Default is false.
 	Enable bool `long:"enable" description:"enable Prometheus exporting of lnd gRPC performance metrics."`
+
+	// PerfHistograms indicates if the additional histogram information for
+	// latency, and handling time of gRPC calls should be enabled. This
+	// generates additional data, and consumes more memory for the
+	// Prometheus server.
+	PerfHistograms bool `long:"perfhistograms" description:"enable additional histogram to track gRPC call processing performance (latency, etc)"`
 }

 // DefaultPrometheus is the default configuration for the Prometheus metrics
--- a/monitoring/monitoring_on.go
+++ b/monitoring/monitoring_on.go
@ -35,6 +35,14 @@ func ExportPrometheusMetrics(grpcServer *grpc.Server, cfg lncfg.Prometheus) erro

 		grpc_prometheus.Register(grpcServer)

+		// Enable the histograms which can allow plotting latency
+		// distributions of inbound calls. However we guard this behind
+		// another flag as this can generate a lot of additional data,
+		// as it's typically a high cardinality metric.
+		if cfg.PerfHistograms {
+			grpc_prometheus.EnableHandlingTimeHistogram()
+		}
+
 		http.Handle("/metrics", promhttp.Handler())
 		go func() {
 			http.ListenAndServe(cfg.Listen, nil)
--- a/sample-lnd.conf
+++ b/sample-lnd.conf
@ -421,6 +421,12 @@
 ; Specify the interface to listen on for Prometheus connections.
 ; prometheus.listen=0.0.0.0:8989

+; If true, then we'll export additional information that allows users to plot
+; the processing latency and total time spent for each RPC call and service.
+; This generates additional memory load for the Prometheus server, and will end
+; up using more disk space over time.
+; prometheus.perfhistograms=true
+
 ; The alias your node will use, which can be up to 32 UTF-8 characters in
 ; length.
 ; alias=My Lightning ☇