Skip to content

Remove occurrence and project counts

mo khan requested to merge mokhax/422087/two-phase-query into master

What does this MR do and why?

This MR attempts to optimize the group.sbom_occurrences query by removing the project_count and occurrence_count aggregations. These aggregations make it difficult to move towards using efficient IN operator queries. I would like to propose that we render each detected dependency as a separate row in the UI. I think that this is beneficial for a performance perspective but it also helps with the data export functionality because the export would include each unique detected dependency instead of a single row for each dependency and the # of projects that they were detected in.

To accomplish this I need to change the UI to be able to render each unique occurrence detected instead of aggregating them by component/version. See the screenshots below.

Before:

WITH "our_occurrences" AS MATERIALIZED (
  SELECT "sbom_occurrences".*
  FROM "sbom_occurrences"
  WHERE "sbom_occurrences"."project_id" IN (
    SELECT "projects"."id"
    FROM "projects"
    WHERE "projects"."namespace_id" IN (
      SELECT namespaces.traversal_ids[array_length(namespaces.traversal_ids, 1)] AS id
      FROM "namespaces"
      WHERE "namespaces"."type" = 'Group'
      AND (traversal_ids @> ('{9970}'))
    )
    AND "projects"."marked_for_deletion_at" IS NULL
    AND "projects"."pending_delete" = FALSE
  )
)
SELECT sbom_occurrences.*, agg_occurrences.occurrence_count, agg_occurrences.project_count
FROM "sbom_occurrences"
INNER JOIN (
  SELECT component_id,
        COUNT(DISTINCT id) AS occurrence_count,
        COUNT(DISTINCT project_id) AS project_count
  FROM our_occurrences
  GROUP BY component_id
) agg_occurrences ON sbom_occurrences.component_id = agg_occurrences.component_id
WHERE "sbom_occurrences"."id" IN (
  SELECT "our_occurrences"."id"
  FROM "our_occurrences"
)
ORDER BY "sbom_occurrences"."id" ASC
LIMIT 20 OFFSET 0;
Time: 21.698 s
  - planning: 18.821 ms
  - execution: 21.679 s
    - I/O read: 20.094 s
    - I/O write: 0.000 ms

Shared buffers:
  - hits: 407930 (~3.10 GiB) from the buffer pool
  - reads: 44820 (~350.20 MiB) from the OS file cache, including disk I/O
  - dirtied: 0
  - writes: 0

https://console.postgres.ai/gitlab/gitlab-production-tunnel-pg12/sessions/22806/commands/73586

After:

SELECT "sbom_occurrences".*
FROM "sbom_occurrences"
WHERE "sbom_occurrences"."project_id" IN (
  SELECT DISTINCT "projects"."id"
  FROM "projects"
  WHERE "projects"."namespace_id" IN (
    SELECT namespaces.traversal_ids[array_length(namespaces.traversal_ids, 1)] AS id
    FROM "namespaces"
    WHERE "namespaces"."type" = 'Group'
    AND (traversal_ids @> ('{9970}'))
  )
  AND "projects"."marked_for_deletion_at" IS NULL
  AND "projects"."pending_delete" = FALSE
) 
ORDER BY "sbom_occurrences"."id" ASC
LIMIT 20
OFFSET 0;
Time: 12.846 s
  - planning: 8.855 ms
  - execution: 12.837 s
    - I/O read: 12.612 s
    - I/O write: 0.000 ms

Shared buffers:
  - hits: 1620 (~12.70 MiB) from the buffer pool
  - reads: 4711 (~36.80 MiB) from the OS file cache, including disk I/O
  - dirtied: 190 (~1.50 MiB)
  - writes: 0

https://console.postgres.ai/gitlab/gitlab-production-tunnel-pg12/sessions/23222/commands/74708

sort by package manager

SELECT "sbom_occurrences".*
FROM "sbom_occurrences"
WHERE "sbom_occurrences"."project_id" IN (
  SELECT "projects"."id"
  FROM "projects"
  WHERE "projects"."namespace_id" IN (
    SELECT namespaces.traversal_ids[array_length(namespaces.traversal_ids, 1)] AS id
    FROM "namespaces"
    WHERE "namespaces"."type" = 'Group'
    AND (traversal_ids @> ('{9970}'))
  )
  AND "projects"."marked_for_deletion_at" IS NULL
  AND "projects"."pending_delete" = FALSE
)
ORDER BY "sbom_occurrences"."package_manager" ASC
LIMIT 20 OFFSET 0

https://console.postgres.ai/gitlab/gitlab-production-tunnel-pg12/sessions/22835/commands/73667

sort by component name

SELECT "sbom_occurrences".*
FROM "sbom_occurrences"
WHERE "sbom_occurrences"."project_id" IN (
  SELECT "projects"."id"
  FROM "projects"
  WHERE "projects"."namespace_id" IN (
    SELECT namespaces.traversal_ids[array_length(namespaces.traversal_ids, 1)] AS id
    FROM "namespaces"
    WHERE "namespaces"."type" = 'Group'
    AND (traversal_ids @> ('{9970}'))
  )
  AND "projects"."marked_for_deletion_at" IS NULL
  AND "projects"."pending_delete" = FALSE
)
ORDER BY "sbom_occurrences"."component_name" ASC
LIMIT 20 OFFSET 0

https://console.postgres.ai/gitlab/gitlab-production-tunnel-pg12/sessions/22835/commands/73668

sort by license

SELECT "sbom_occurrences".*
FROM "sbom_occurrences"
WHERE "sbom_occurrences"."project_id" IN (
  SELECT "projects"."id"
  FROM "projects"
  WHERE "projects"."namespace_id" IN (
    SELECT namespaces.traversal_ids[array_length(namespaces.traversal_ids, 1)] AS id
    FROM "namespaces"
    WHERE "namespaces"."type" = 'Group'
    AND (traversal_ids @> ('{9970}'))
  )
  AND "projects"."marked_for_deletion_at" IS NULL
  AND "projects"."pending_delete" = FALSE
)
ORDER BY
  (licenses#>'{0,spdx_identifier}')::text ASC,
  (licenses#>'{1,spdx_identifier}')::text ASC
LIMIT 20 OFFSET 0

https://console.postgres.ai/gitlab/gitlab-production-tunnel-pg12/sessions/22835/commands/73669

search by package manager

SELECT "sbom_occurrences".*
FROM "sbom_occurrences"
WHERE "sbom_occurrences"."project_id" IN (
  SELECT "projects"."id"
  FROM "projects"
  WHERE "projects"."namespace_id" IN (
    SELECT namespaces.traversal_ids[array_length(namespaces.traversal_ids, 1)] AS id
    FROM "namespaces"
    WHERE "namespaces"."type" = 'Group'
    AND (traversal_ids @> ('{9970}'))
  )
  AND "projects"."marked_for_deletion_at" IS NULL
  AND "projects"."pending_delete" = FALSE
)
AND "sbom_occurrences"."package_manager" = 'npm'  
ORDER BY "sbom_occurrences"."id" ASC
LIMIT 20 OFFSET 0

https://console.postgres.ai/gitlab/gitlab-production-tunnel-pg12/sessions/22835/commands/73670

search by component name

SELECT "sbom_occurrences".*
FROM "sbom_occurrences"
WHERE "sbom_occurrences"."project_id" IN (
  SELECT "projects"."id"
  FROM "projects"
  WHERE "projects"."namespace_id" IN (
    SELECT namespaces.traversal_ids[array_length(namespaces.traversal_ids, 1)] AS id
    FROM "namespaces"
    WHERE "namespaces"."type" = 'Group'
    AND (traversal_ids @> ('{9970}'))
  )
  AND "projects"."marked_for_deletion_at" IS NULL
  AND "projects"."pending_delete" = FALSE
)
AND "sbom_occurrences"."component_name" = 'activerecord'
ORDER BY "sbom_occurrences"."id" ASC
LIMIT 20 OFFSET 0

https://console.postgres.ai/gitlab/gitlab-production-tunnel-pg12/sessions/22835/commands/73671

Screenshots or screen recordings

Before After
image image

MR acceptance checklist

This checklist encourages us to confirm any changes have been analyzed to reduce risks in quality, performance, reliability, security, and maintainability.

Edited by mo khan

Merge request reports