Skip to content

Fix plan checks for mirror queries

Sean McGivern requested to merge fix-remote-mirrors-exporter into master

Plans are no longer through namespace.plan_id, but through the root namespace via gitlab_subscriptions. (See gitlab-org/gitlab#196862 (closed) for more details on this being confusing.)

This query matches the one in gitlab-org/gitlab!27796 (merged), which is used for the actual mirroring we do 🙂

I tested the mirror queries on a production DB:

require 'gitlab_exporter'
require 'gitlab_exporter/database/row_count'

collector = GitLab::Exporter::Database::RowCountCollector.new(connection_string: 'host=localhost')

GitLab::Exporter::Database::RowCountCollector::QUERIES.each do |key, query|
  puts ''
  puts '-' * 70
  puts ''
  puts key
  puts collector.send(:construct_query, query)
end

And got these queries and results:

gitlabhq_production=> SELECT COUNT(*) FROM projects INNER JOIN project_mirror_data ON project_mirror_data.project_id = projects.id
gitlabhq_production-> INNER JOIN namespaces AS root_namespaces ON root_namespaces.id = (
gitlabhq_production(>   WITH RECURSIVE "base_and_ancestors" AS (
gitlabhq_production(>     (SELECT "namespaces".* FROM "namespaces" WHERE "namespaces"."id" = projects.namespace_id)
gitlabhq_production(>     UNION
gitlabhq_production(>     (SELECT "namespaces".* FROM "namespaces", "base_and_ancestors" WHERE "namespaces"."id" = "base_and_ancestors"."parent_id")
gitlabhq_production(>   ) SELECT "namespaces".id FROM "base_and_ancestors" AS "namespaces" WHERE "namespaces"."parent_id" IS NULL
gitlabhq_production(> )
gitlabhq_production-> LEFT JOIN gitlab_subscriptions ON gitlab_subscriptions.namespace_id = root_namespaces.id
gitlabhq_production-> LEFT JOIN plans ON plans.id = gitlab_subscriptions.hosted_plan_id
gitlabhq_production->  WHERE projects.mirror = true
gitlabhq_production-> AND project_mirror_data.retry_count <= 14
gitlabhq_production-> AND (projects.visibility_level = 20 OR plans.name IN ('early_adopter', 'bronze', 'silver', 'gold'))
gitlabhq_production->
gitlabhq_production-> AND project_mirror_data.status NOT IN ('scheduled', 'started')
gitlabhq_production-> AND project_mirror_data.next_execution_timestamp <= NOW()
gitlabhq_production-> ;
 count
-------
   545
(1 row)

gitlabhq_production=> SELECT COUNT(*) FROM projects INNER JOIN project_mirror_data ON project_mirror_data.project_id = projects.id
gitlabhq_production-> INNER JOIN namespaces AS root_namespaces ON root_namespaces.id = (
gitlabhq_production(>   WITH RECURSIVE "base_and_ancestors" AS (
gitlabhq_production(>     (SELECT "namespaces".* FROM "namespaces" WHERE "namespaces"."id" = projects.namespace_id)
gitlabhq_production(>     UNION
gitlabhq_production(>     (SELECT "namespaces".* FROM "namespaces", "base_and_ancestors" WHERE "namespaces"."id" = "base_and_ancestors"."parent_id")
gitlabhq_production(>   ) SELECT "namespaces".id FROM "base_and_ancestors" AS "namespaces" WHERE "namespaces"."parent_id" IS NULL
gitlabhq_production(> )
gitlabhq_production-> LEFT JOIN gitlab_subscriptions ON gitlab_subscriptions.namespace_id = root_namespaces.id
gitlabhq_production-> LEFT JOIN plans ON plans.id = gitlab_subscriptions.hosted_plan_id
gitlabhq_production->  WHERE projects.mirror = true
gitlabhq_production-> AND project_mirror_data.retry_count <= 14
gitlabhq_production-> AND (projects.visibility_level = 20 OR plans.name IN ('early_adopter', 'bronze', 'silver', 'gold'))
gitlabhq_production->
gitlabhq_production-> AND project_mirror_data.status NOT IN ('scheduled', 'started')
gitlabhq_production-> AND (project_mirror_data.next_execution_timestamp - project_mirror_data.last_update_at) <= '30 minutes'::interval
gitlabhq_production-> AND project_mirror_data.last_update_at < NOW() - '30 minutes'::interval
gitlabhq_production-> ;
 count
-------
     1
(1 row)

gitlabhq_production=> SELECT COUNT(*) FROM projects INNER JOIN project_mirror_data ON project_mirror_data.project_id = projects.id
gitlabhq_production-> INNER JOIN namespaces AS root_namespaces ON root_namespaces.id = (
gitlabhq_production(>   WITH RECURSIVE "base_and_ancestors" AS (
gitlabhq_production(>     (SELECT "namespaces".* FROM "namespaces" WHERE "namespaces"."id" = projects.namespace_id)
gitlabhq_production(>     UNION
gitlabhq_production(>     (SELECT "namespaces".* FROM "namespaces", "base_and_ancestors" WHERE "namespaces"."id" = "base_and_ancestors"."parent_id")
gitlabhq_production(>   ) SELECT "namespaces".id FROM "base_and_ancestors" AS "namespaces" WHERE "namespaces"."parent_id" IS NULL
gitlabhq_production(> )
gitlabhq_production-> LEFT JOIN gitlab_subscriptions ON gitlab_subscriptions.namespace_id = root_namespaces.id
gitlabhq_production-> LEFT JOIN plans ON plans.id = gitlab_subscriptions.hosted_plan_id
gitlabhq_production->  WHERE projects.mirror = true
gitlabhq_production-> AND project_mirror_data.retry_count <= 14
gitlabhq_production-> AND (projects.visibility_level = 20 OR plans.name IN ('early_adopter', 'bronze', 'silver', 'gold'))
gitlabhq_production->
gitlabhq_production-> AND project_mirror_data.status NOT IN ('scheduled', 'started')
gitlabhq_production-> AND project_mirror_data.last_update_at >= NOW() - '30 seconds'::interval
gitlabhq_production-> ;
 count
-------
   888
(1 row)

gitlabhq_production=> SELECT COUNT(*) FROM projects INNER JOIN project_mirror_data ON project_mirror_data.project_id = projects.id
gitlabhq_production-> INNER JOIN namespaces AS root_namespaces ON root_namespaces.id = (
gitlabhq_production(>   WITH RECURSIVE "base_and_ancestors" AS (
gitlabhq_production(>     (SELECT "namespaces".* FROM "namespaces" WHERE "namespaces"."id" = projects.namespace_id)
gitlabhq_production(>     UNION
gitlabhq_production(>     (SELECT "namespaces".* FROM "namespaces", "base_and_ancestors" WHERE "namespaces"."id" = "base_and_ancestors"."parent_id")
gitlabhq_production(>   ) SELECT "namespaces".id FROM "base_and_ancestors" AS "namespaces" WHERE "namespaces"."parent_id" IS NULL
gitlabhq_production(> )
gitlabhq_production-> LEFT JOIN gitlab_subscriptions ON gitlab_subscriptions.namespace_id = root_namespaces.id
gitlabhq_production-> LEFT JOIN plans ON plans.id = gitlab_subscriptions.hosted_plan_id
gitlabhq_production->  WHERE projects.mirror = true
gitlabhq_production-> AND project_mirror_data.retry_count <= 14
gitlabhq_production-> AND (projects.visibility_level = 20 OR plans.name IN ('early_adopter', 'bronze', 'silver', 'gold'))
gitlabhq_production->
gitlabhq_production-> AND project_mirror_data.status NOT IN ('scheduled', 'started')
gitlabhq_production-> AND project_mirror_data.next_execution_timestamp <= NOW() - '10 seconds'::interval
gitlabhq_production-> ;
 count
-------
   361
(1 row)

gitlabhq_production=> SELECT COUNT(*) FROM projects INNER JOIN project_mirror_data ON project_mirror_data.project_id = projects.id
gitlabhq_production-> INNER JOIN namespaces AS root_namespaces ON root_namespaces.id = (
gitlabhq_production(>   WITH RECURSIVE "base_and_ancestors" AS (
gitlabhq_production(>     (SELECT "namespaces".* FROM "namespaces" WHERE "namespaces"."id" = projects.namespace_id)
gitlabhq_production(>     UNION
gitlabhq_production(>     (SELECT "namespaces".* FROM "namespaces", "base_and_ancestors" WHERE "namespaces"."id" = "base_and_ancestors"."parent_id")
gitlabhq_production(>   ) SELECT "namespaces".id FROM "base_and_ancestors" AS "namespaces" WHERE "namespaces"."parent_id" IS NULL
gitlabhq_production(> )
gitlabhq_production-> LEFT JOIN gitlab_subscriptions ON gitlab_subscriptions.namespace_id = root_namespaces.id
gitlabhq_production-> LEFT JOIN plans ON plans.id = gitlab_subscriptions.hosted_plan_id
gitlabhq_production->  WHERE projects.mirror = true
gitlabhq_production-> AND project_mirror_data.retry_count <= 14
gitlabhq_production-> AND (projects.visibility_level = 20 OR plans.name IN ('early_adopter', 'bronze', 'silver', 'gold'))
gitlabhq_production->
gitlabhq_production-> AND project_mirror_data.status IN ('scheduled', 'started')
gitlabhq_production-> ;
 count
-------
    25
(1 row)

gitlabhq_production=> SELECT COUNT(*) FROM projects INNER JOIN project_mirror_data ON project_mirror_data.project_id = projects.id
gitlabhq_production-> INNER JOIN namespaces AS root_namespaces ON root_namespaces.id = (
gitlabhq_production(>   WITH RECURSIVE "base_and_ancestors" AS (
gitlabhq_production(>     (SELECT "namespaces".* FROM "namespaces" WHERE "namespaces"."id" = projects.namespace_id)
gitlabhq_production(>     UNION
gitlabhq_production(>     (SELECT "namespaces".* FROM "namespaces", "base_and_ancestors" WHERE "namespaces"."id" = "base_and_ancestors"."parent_id")
gitlabhq_production(>   ) SELECT "namespaces".id FROM "base_and_ancestors" AS "namespaces" WHERE "namespaces"."parent_id" IS NULL
gitlabhq_production(> )
gitlabhq_production-> LEFT JOIN gitlab_subscriptions ON gitlab_subscriptions.namespace_id = root_namespaces.id
gitlabhq_production-> LEFT JOIN plans ON plans.id = gitlab_subscriptions.hosted_plan_id
gitlabhq_production->  WHERE projects.mirror = true
gitlabhq_production-> AND project_mirror_data.retry_count <= 14
gitlabhq_production-> AND (projects.visibility_level = 20 OR plans.name IN ('early_adopter', 'bronze', 'silver', 'gold'))
gitlabhq_production->
gitlabhq_production-> AND project_mirror_data.status = 'scheduled'
gitlabhq_production-> ;
 count
-------
     0
(1 row)

This query still isn't perfect: we have some data integrity issues where a project is public but its root namespace isn't (so the project isn't actually public and doesn't get mirroring for free), but this broadly matches what the application is doing 🙂

For gitlab-com/gl-infra/scalability#233 (closed).

Edited by Sean McGivern

Merge request reports