AuthorizedProjectsWorker, AuthorizedProjectUpdate::UserRefreshWithLowUrgencyWorker is issuing many hotspot recursive CTE namespace queries
This morning GitLab.com alerting fired with an SLO alert for the rails_sql
SLI:
https://dashboards.gitlab.net/d/patroni-main/patroni-overview?orgId=1
There were also SLO alerts for Sidekiq at the same time:
Looking at the postgres slowlog, and pg_stat_statements
metrics, we can see that the server is dominated by requests for queryid -7232084447659837857
.
This query:
WITH RECURSIVE "namespaces_cte" AS ((
SELECT
"namespaces"."id",
"members"."access_level"
FROM
"namespaces"
INNER JOIN "members" ON "namespaces"."id" = "members"."source_id"
WHERE
"members"."type" = $1
AND "members"."source_type" = $2
AND "namespaces"."type" = $3
AND "members"."user_id" = $4
AND "members"."requested_at" IS NULL
AND (access_level >= $5))
UNION (
SELECT
"namespaces"."id",
LEAST ("members"."access_level", "group_group_links"."group_access") AS access_level
FROM
"namespaces"
INNER JOIN "group_group_links" ON "group_group_links"."shared_group_id" = "namespaces"."id"
INNER JOIN "members" ON "group_group_links"."shared_with_group_id" = "members"."source_id"
AND "members"."source_type" = $6
AND "members"."requested_at" IS NULL
AND "members"."user_id" = $7
AND "members"."access_level" > $8
WHERE
"namespaces"."type" = $9)
UNION (
SELECT
"namespaces"."id",
GREATEST ("members"."access_level", "namespaces_cte"."access_level") AS access_level
FROM
"namespaces"
INNER JOIN "namespaces_cte" ON "namespaces_cte"."id" = "namespaces"."parent_id"
LEFT OUTER JOIN "members" ON "members"."source_id" = "namespaces"."id"
AND "members"."source_type" = $10
AND "members"."requested_at" IS NULL
AND "members"."user_id" = $11
AND "members"."access_level" > $12
WHERE
"namespaces"."type" = $13))
SELECT
"project_authorizations"."project_id",
MAX(access_level) AS access_level
FROM ((
SELECT
projects.id AS project_id,
members.access_level
FROM
"projects"
INNER JOIN "members" ON "projects"."id" = "members"."source_id"
WHERE
"members"."type" = $14
AND "members"."source_type" = $15
AND "members"."user_id" = $16
AND "members"."requested_at" IS NULL)
UNION (
SELECT
projects.id AS project_id,
$17 AS access_level
FROM
"projects"
INNER JOIN "namespaces" ON "projects"."namespace_id" = "namespaces"."id"
WHERE
"namespaces"."owner_id" = $18
AND "namespaces"."type" IS NULL)
UNION (
SELECT
"projects"."id" AS project_id,
"namespaces"."access_level"
FROM
"namespaces_cte" "namespaces"
INNER JOIN "projects" ON "projects"."namespace_id" = "namespaces"."id")
UNION (
SELECT
"project_group_links"."project_id",
LEAST ("namespaces"."access_level", "project_group_links"."group_access") AS access_level
FROM
"namespaces_cte" "namespaces"
INNER JOIN project_group_links ON project_group_links.group_id = namespaces.id
INNER JOIN projects ON projects.id = project_group_links.project_id
INNER JOIN namespaces p_ns ON p_ns.id = projects.namespace_id
WHERE (p_ns.share_with_group_lock IS FALSE))) project_authorizations
GROUP BY
"project_authorizations"."project_id"
Charting at the total time spent in statements, we can see this query dominating during the slowdowns:
From the slowlog we can deduce that the problem is AuthorizedProjectsWorker
.
Edited by Andrew Newdigate