From 8be357bd0857773167a2d84b78d5896b3ceeb51f Mon Sep 17 00:00:00 2001
From: Sami Hiltunen <shiltunen@gitlab.com>
Date: Wed, 17 Nov 2021 15:17:22 +0200
Subject: [PATCH 1/3] Materialize valid_primaries view in dataloss query

The dataloss query is extremely slow for bigger datasets. The problem
is that for each row that the data loss query is returning,
Postgres computes the full result of the valid_primaries view only to
filter down to the correct record. This results in an o(n2) complexity
which kills the performance as soon as the dataset size increases. It's
not clear why the join parameters are not pushed down in to the view in
the query.

This commit optimizes the query by materializing the valid_primaries view.
This ensures Postgres computes the full view only once and joins with the
pre-computed result.

Changelog: performance
---
 internal/praefect/datastore/repository_store.go | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/internal/praefect/datastore/repository_store.go b/internal/praefect/datastore/repository_store.go
index 9c8dcf30e17..270c0b2789c 100644
--- a/internal/praefect/datastore/repository_store.go
+++ b/internal/praefect/datastore/repository_store.go
@@ -651,6 +651,10 @@ func (rs *PostgresRepositoryStore) GetPartiallyAvailableRepositories(ctx context
 	//    than the assigned ones.
 	//
 	rows, err := rs.db.QueryContext(ctx, `
+WITH valid_primaries AS MATERIALIZED (
+	SELECT * FROM valid_primaries
+)
+
 SELECT
 	json_build_object (
 		'RelativePath', relative_path,
-- 
GitLab


From 4372302d5a67def30e76a4d30440a4478a22fd91 Mon Sep 17 00:00:00 2001
From: Sami Hiltunen <shiltunen@gitlab.com>
Date: Wed, 17 Nov 2021 15:28:24 +0200
Subject: [PATCH 2/3] Get the latest generation from repositories instead of a
 view

Dataloss query is currently getting the latest generation of a repository
from a view that takes the max generation from storage_repositories. This
is unnecessary as the repositories table already contains the latest generation
and we can take it from there instead. This commit reads it from the repositories
table instead.

Changelog: performance
---
 internal/praefect/datastore/repository_store.go     |  3 +--
 .../praefect/datastore/repository_store_test.go     | 13 ++++++++++---
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/internal/praefect/datastore/repository_store.go b/internal/praefect/datastore/repository_store.go
index 270c0b2789c..3d68ea7dea7 100644
--- a/internal/praefect/datastore/repository_store.go
+++ b/internal/praefect/datastore/repository_store.go
@@ -674,7 +674,7 @@ FROM (
 		relative_path,
 		repositories.primary,
 		storage,
-		repository_generations.generation - COALESCE(storage_repositories.generation, -1) AS behind_by,
+		repositories.generation - COALESCE(storage_repositories.generation, -1) AS behind_by,
 		repository_assignments.storage IS NOT NULL AS assigned,
 		healthy_storages.storage IS NOT NULL AS healthy,
 		valid_primaries.storage IS NOT NULL AS valid_primary
@@ -692,7 +692,6 @@ FROM (
 		)
 	) AS repository_assignments USING (virtual_storage, relative_path, storage)
 	JOIN repositories USING (virtual_storage, relative_path)
-	JOIN repository_generations USING (virtual_storage, relative_path)
 	LEFT JOIN healthy_storages USING (virtual_storage, storage)
 	LEFT JOIN valid_primaries USING (virtual_storage, relative_path, storage)
 	WHERE virtual_storage = $1
diff --git a/internal/praefect/datastore/repository_store_test.go b/internal/praefect/datastore/repository_store_test.go
index f8bd2085d06..a38b0875d9b 100644
--- a/internal/praefect/datastore/repository_store_test.go
+++ b/internal/praefect/datastore/repository_store_test.go
@@ -1321,10 +1321,17 @@ func TestPostgresRepositoryStore_GetPartiallyAvailableRepositories(t *testing.T)
 			})
 
 			if !tc.nonExistentRepository {
+				var maxGeneration int
+				for _, generation := range tc.existingGenerations {
+					if generation > maxGeneration {
+						maxGeneration = generation
+					}
+				}
+
 				_, err := tx.ExecContext(ctx, `
-							INSERT INTO repositories (virtual_storage, relative_path, "primary")
-							VALUES ('virtual-storage', 'relative-path', 'repository-primary')
-						`)
+							INSERT INTO repositories (virtual_storage, relative_path, "primary", generation)
+							VALUES ('virtual-storage', 'relative-path', 'repository-primary', $1)
+						`, maxGeneration)
 				require.NoError(t, err)
 			}
 
-- 
GitLab


From 853ae2510e8151ac304cc4ddafb9873f42de4133 Mon Sep 17 00:00:00 2001
From: Sami Hiltunen <shiltunen@gitlab.com>
Date: Wed, 17 Nov 2021 18:33:54 +0200
Subject: [PATCH 3/3] Materialize valid_primaries view in
 RepositoryStoreCollector

RepositoryStoreCollector gathers metrics on repositories which don't
have a valid primary candidates available. This indicates the repository
is unavailable as the current primary is not valid and ther are no valid
candidates to failover to. The query is currently extremely inefficient
on some versions of Postgres as it ends up computing the full valid_primaries
view for each of the rows it checks. This doesn't seem to occur on all versions
of Postgres, namely 12.6 at least manages to push down the search criteria
inside the view. This commit fixes the situation by materializing the
valid_primaries view prior to querying it. This ensures the full view isn't
computed for all of the rows but rather Postgres just uses the pre-computed
result.

Changelog: performance
---
 internal/praefect/datastore/collector.go | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/internal/praefect/datastore/collector.go b/internal/praefect/datastore/collector.go
index 15bc463947c..71f145e67da 100644
--- a/internal/praefect/datastore/collector.go
+++ b/internal/praefect/datastore/collector.go
@@ -72,6 +72,11 @@ func (c *RepositoryStoreCollector) Collect(ch chan<- prometheus.Metric) {
 // they are either unhealthy or out of date.
 func (c *RepositoryStoreCollector) queryMetrics(ctx context.Context) (map[string]int, error) {
 	rows, err := c.db.QueryContext(ctx, `
+WITH valid_primaries AS MATERIALIZED (
+	SELECT virtual_storage, relative_path
+	FROM valid_primaries
+)
+
 SELECT virtual_storage, COUNT(*)
 FROM repositories
 WHERE NOT EXISTS (
-- 
GitLab