Skip to content
Snippets Groups Projects
Commit e53840b2 authored by John Cai's avatar John Cai Committed by Patrick Steinhardt
Browse files

praefect: Add ability to have separate database metrics endpoint

By default, when metrics are enabled, then each Praefect will expose
information about how many read-only repositories there are, which
requires Praefect to query the database. First, this will result in the
same metrics being exposed by every Praefect given that the database is
shared between all of them. And second, this will cause one query per
Praefect per scraping run. This cost does add up and generate quite some
load on the database, especially so if there is a lot of repositories in
that database, up to a point where it may overload the database
completely.

Fix this issue by splitting metrics which hit the database into a
separate endpoint "/db_metrics". This allows admins to set up a separate
scraper with a different scraping interval for this metric, and
furthermore it gives the ability to only scrape this metric for one of
the Praefect instances so the work isn't unnecessarily duplicated.

Given that this is a breaking change which will get backported, we must
make this behaviour opt-in for now. We thus include a new configuration
key "prometheus_use_database_endpoint" which enables the new behaviour
such that existing installations' metrics won't break on a simple point
release. The intent is to eventually remove this configuration though
and enable it for all setups on a major release.

Changelog: added
(cherry picked from commit 7e74b733)
parent 35844da3
No related branches found
No related tags found
Loading
This commit is part of merge request !4096. Comments created here will be created in the context of that merge request.
......@@ -63,11 +63,13 @@ import (
"flag"
"fmt"
"math/rand"
"net/http"
"os"
"strings"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/sirupsen/logrus"
"gitlab.com/gitlab-org/gitaly/v14/internal/backchannel"
"gitlab.com/gitlab-org/gitaly/v14/internal/bootstrap"
......@@ -147,7 +149,9 @@ func main() {
logger.Fatalf("unable to create a bootstrap: %v", err)
}
if err := run(starterConfigs, conf, b, prometheus.DefaultRegisterer); err != nil {
dbPromRegistry := prometheus.NewRegistry()
if err := run(starterConfigs, conf, b, prometheus.DefaultRegisterer, dbPromRegistry); err != nil {
logger.Fatalf("%v", err)
}
}
......@@ -190,7 +194,16 @@ func configure(conf config.Config) {
sentry.ConfigureSentry(version.GetVersion(), conf.Sentry)
}
func run(cfgs []starter.Config, conf config.Config, b bootstrap.Listener, promreg prometheus.Registerer) error {
func run(
cfgs []starter.Config,
conf config.Config,
b bootstrap.Listener,
promreg prometheus.Registerer,
dbPromRegistry interface {
prometheus.Registerer
prometheus.Gatherer
},
) error {
nodeLatencyHistogram, err := metrics.RegisterNodeLatency(conf.Prometheus, promreg)
if err != nil {
return err
......@@ -380,9 +393,18 @@ func run(cfgs []starter.Config, conf config.Config, b bootstrap.Listener, promre
)
metricsCollectors = append(metricsCollectors, transactionManager, coordinator, repl)
if db != nil {
promreg.MustRegister(
datastore.NewRepositoryStoreCollector(logger, conf.VirtualStorageNames(), db, conf.Prometheus.ScrapeTimeout),
)
repositoryStoreCollector := datastore.NewRepositoryStoreCollector(logger, conf.VirtualStorageNames(), db, conf.Prometheus.ScrapeTimeout)
// Eventually, database-related metrics will always be exported via a separate
// endpoint such that it's possible to set a different scraping interval and thus to
// reduce database load. For now though, we register the metrics twice, once for the
// standard and once for the database-specific endpoint. This is done to ensure a
// transitory period where deployments can be moved to the new endpoint without
// causing breakage if they still use the old endpoint.
dbPromRegistry.MustRegister(repositoryStoreCollector)
if !conf.PrometheusExcludeDatabaseFromDefaultMetrics {
promreg.MustRegister(repositoryStoreCollector)
}
}
promreg.MustRegister(metricsCollectors...)
......@@ -403,9 +425,13 @@ func run(cfgs []starter.Config, conf config.Config, b bootstrap.Listener, promre
return err
}
serveMux := http.NewServeMux()
serveMux.Handle("/db_metrics", promhttp.HandlerFor(dbPromRegistry, promhttp.HandlerOpts{}))
go func() {
if err := monitoring.Start(
monitoring.WithListener(l),
monitoring.WithServeMux(serveMux),
monitoring.WithBuildInformation(praefect.GetVersion(), praefect.GetBuildTime())); err != nil {
logger.WithError(err).Errorf("Unable to start prometheus listener: %v", conf.PrometheusListenAddr)
}
......
......@@ -84,7 +84,7 @@ func TestListUntrackedRepositories_Exec(t *testing.T) {
require.NoError(t, err)
bootstrapper := bootstrap.NewNoop()
go func() {
assert.NoError(t, run(starterConfigs, conf, bootstrapper, prometheus.NewRegistry()))
assert.NoError(t, run(starterConfigs, conf, bootstrapper, prometheus.NewRegistry(), prometheus.NewRegistry()))
}()
cc, err := client.Dial("unix://"+conf.SocketPath, nil)
......
......@@ -104,7 +104,7 @@ func TestRemoveRepository_Exec(t *testing.T) {
require.NoError(t, err)
bootstrapper := bootstrap.NewNoop()
go func() {
assert.NoError(t, run(starterConfigs, conf, bootstrapper, prometheus.NewRegistry()))
assert.NoError(t, run(starterConfigs, conf, bootstrapper, prometheus.NewRegistry(), prometheus.NewRegistry()))
}()
cc, err := client.Dial("unix://"+conf.SocketPath, nil)
......
......@@ -117,10 +117,16 @@ type Config struct {
Sentry sentry.Config `toml:"sentry"`
PrometheusListenAddr string `toml:"prometheus_listen_addr"`
Prometheus prometheus.Config `toml:"prometheus"`
Auth auth.Config `toml:"auth"`
TLS config.TLS `toml:"tls"`
DB `toml:"database"`
Failover Failover `toml:"failover"`
// PrometheusExcludeDatabaseFromDefaultMetrics excludes database-related metrics from the
// default metrics. If set to `false`, then database metrics will be available both via
// `/metrics` and `/db_metrics`. Otherwise, they will only be accessible via `/db_metrics`.
// Defaults to `false`. This is used as a transitory configuration key: eventually, database
// metrics will always be removed from the standard metrics endpoint.
PrometheusExcludeDatabaseFromDefaultMetrics bool `toml:"prometheus_exclude_database_from_default_metrics"`
Auth auth.Config `toml:"auth"`
TLS config.TLS `toml:"tls"`
DB `toml:"database"`
Failover Failover `toml:"failover"`
// Keep for legacy reasons: remove after Omnibus has switched
FailoverEnabled bool `toml:"failover_enabled"`
MemoryQueueEnabled bool `toml:"memory_queue_enabled"`
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment