Skip to content
Snippets Groups Projects
Commit ae886c54 authored by David Fernandez's avatar David Fernandez :palm_tree:
Browse files

Merge branch '390098-delete-the-duplicate-npm-packages-from-the-database' into 'master'

Add migration to mark duplicate npm packages for destruction

See merge request !114695



Merged-by: David Fernandez's avatarDavid Fernandez <dfernandez@gitlab.com>
Approved-by: default avatarMoaz Khalifa <mkhalifa@gitlab.com>
Approved-by: default avatarMichał Zając <mzajac@gitlab.com>
Approved-by: David Fernandez's avatarDavid Fernandez <dfernandez@gitlab.com>
Reviewed-by: David Fernandez's avatarDavid Fernandez <dfernandez@gitlab.com>
Reviewed-by: Terri Chu's avatarTerri Chu <tchu@gitlab.com>
Reviewed-by: default avatarMoaz Khalifa <mkhalifa@gitlab.com>
Co-authored-by: default avatarDzmitry Meshcharakou <12459192-dmeshcharakou@users.noreply.gitlab.com>
Co-authored-by: Terri Chu's avatarTerri Chu <tchu@gitlab.com>
parents 3350680c b4044dd3
No related branches found
No related tags found
2 merge requests!119439Draft: Prevent file variable content expansion in downstream pipeline,!114695Add migration to mark duplicate npm packages for destruction
Pipeline #896962601 failed
Pipeline: E2E Omnibus GitLab EE

#897056661

    Pipeline: GitLab

    #896992976

      Pipeline: E2E GDK

      #896981562

        ---
        migration_job_name: MarkDuplicateNpmPackagesForDestruction
        description: It seeks duplicate npm packages and marks them for destruction.
        feature_category: package_registry
        introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/114695
        milestone: 16.1
        # frozen_string_literal: true
        class AddTempIndexToPackagesOnProjectIdWhenNpmAndNotPendingDestruction < Gitlab::Database::Migration[2.1]
        disable_ddl_transaction!
        INDEX_NAME = 'tmp_idx_packages_on_project_id_when_npm_not_pending_destruction'
        NPM_PACKAGE_TYPE = 2
        PENDING_DESTRUCTION_STATUS = 4
        def up
        # Temporary index to be removed in 16.2 https://gitlab.com/gitlab-org/gitlab/-/issues/414216
        add_concurrent_index(
        :packages_packages,
        :project_id,
        name: INDEX_NAME,
        where: "package_type = #{NPM_PACKAGE_TYPE} AND status <> #{PENDING_DESTRUCTION_STATUS}"
        )
        end
        def down
        remove_concurrent_index_by_name :packages_packages, INDEX_NAME
        end
        end
        # frozen_string_literal: true
        class QueueMarkDuplicateNpmPackagesForDestruction < Gitlab::Database::Migration[2.1]
        MIGRATION = 'MarkDuplicateNpmPackagesForDestruction'
        DELAY_INTERVAL = 2.minutes
        BATCH_SIZE = 5000
        BATCH_CLASS_NAME = 'LooseIndexScanBatchingStrategy'
        SUB_BATCH_SIZE = 500
        disable_ddl_transaction!
        restrict_gitlab_migration gitlab_schema: :gitlab_main
        def up
        queue_batched_background_migration(
        MIGRATION,
        :packages_packages,
        :project_id,
        job_interval: DELAY_INTERVAL,
        batch_size: BATCH_SIZE,
        batch_class_name: BATCH_CLASS_NAME,
        sub_batch_size: SUB_BATCH_SIZE
        )
        end
        def down
        delete_batched_background_migration(MIGRATION, :packages_packages, :project_id, [])
        end
        end
        3709d98612eca5bfe996c20770b313322cde010fbf5b1e69c8ef688a456e1ace
        \ No newline at end of file
        65b99cce227ef0a66cc83624170c680cc6edf72bd7e30e7c562104f4dbebabf9
        \ No newline at end of file
        ......@@ -33563,6 +33563,8 @@ CREATE INDEX tmp_idx_for_feedback_comment_processing ON vulnerability_feedback U
         
        CREATE INDEX tmp_idx_for_vulnerability_feedback_migration ON vulnerability_feedback USING btree (id) WHERE ((migrated_to_state_transition = false) AND (feedback_type = 0));
         
        CREATE INDEX tmp_idx_packages_on_project_id_when_npm_not_pending_destruction ON packages_packages USING btree (project_id) WHERE ((package_type = 2) AND (status <> 4));
        CREATE INDEX tmp_idx_vulnerability_occurrences_on_id_where_report_type_7_99 ON vulnerability_occurrences USING btree (id) WHERE (report_type = ANY (ARRAY[7, 99]));
         
        CREATE INDEX tmp_index_ci_job_artifacts_on_expire_at_where_locked_unknown ON ci_job_artifacts USING btree (expire_at, job_id) WHERE ((locked = 2) AND (expire_at IS NOT NULL));
        # frozen_string_literal: true
        module Gitlab
        module BackgroundMigration
        # It seeks duplicate npm packages and mark them for destruction
        class MarkDuplicateNpmPackagesForDestruction < BatchedMigrationJob
        NPM_PACKAGE_TYPE = 2
        PENDING_DESTRUCTION_STATUS = 4
        operation_name :update_all
        feature_category :package_registry
        # Temporary class to link AR model to the `packages_packages` table
        class Package < ::ApplicationRecord
        include EachBatch
        self.table_name = 'packages_packages'
        end
        def perform
        distinct_each_batch do |batch|
        project_ids = batch.pluck(:project_id)
        subquery = Package
        .where(project_id: project_ids, package_type: NPM_PACKAGE_TYPE)
        .where.not(status: PENDING_DESTRUCTION_STATUS)
        .select('project_id, name, version, MAX(id) AS max_id')
        .group(:project_id, :name, :version)
        .having('COUNT(*) > 1')
        join_query = <<~SQL.squish
        INNER JOIN (#{subquery.to_sql}) AS duplicates
        ON packages_packages.project_id = duplicates.project_id
        AND packages_packages.name = duplicates.name
        AND packages_packages.version = duplicates.version
        SQL
        Package
        .joins(join_query)
        .where.not('packages_packages.id = duplicates.max_id')
        .each_batch do |batch_to_update|
        batch_to_update.update_all(status: PENDING_DESTRUCTION_STATUS)
        end
        end
        end
        end
        end
        end
        # frozen_string_literal: true
        require 'spec_helper'
        RSpec.describe Gitlab::BackgroundMigration::MarkDuplicateNpmPackagesForDestruction, schema: 20230524201454, feature_category: :package_registry do # rubocop:disable Layout/LineLength
        describe '#perform' do
        let(:projects_table) { table(:projects) }
        let(:namespaces_table) { table(:namespaces) }
        let(:packages_table) { table(:packages_packages) }
        let!(:namespace) do
        namespaces_table.create!(name: 'project', path: 'project', type: 'Project')
        end
        let!(:project) do
        projects_table.create!(
        namespace_id: namespace.id,
        name: 'project',
        path: 'project',
        project_namespace_id: namespace.id
        )
        end
        let!(:package_1) do
        packages_table.create!(
        project_id: project.id,
        name: 'test1',
        version: '1.0.0',
        package_type: described_class::NPM_PACKAGE_TYPE
        )
        end
        let!(:package_2) do
        packages_table.create!(
        project_id: project.id,
        name: 'test2',
        version: '1.0.0',
        package_type: described_class::NPM_PACKAGE_TYPE
        )
        end
        let!(:package_3) do
        packages_table.create!(
        project_id: project.id,
        name: 'test3',
        version: '1.0.0',
        package_type: described_class::NPM_PACKAGE_TYPE
        )
        end
        let(:migration) do
        described_class.new(
        start_id: projects_table.minimum(:id),
        end_id: projects_table.maximum(:id),
        batch_table: :packages_packages,
        batch_column: :project_id,
        sub_batch_size: 10,
        pause_ms: 0,
        connection: ApplicationRecord.connection
        )
        end
        before do
        # create a duplicated package without triggering model validation errors
        package_2.update_column(:name, package_1.name)
        package_3.update_column(:name, package_1.name)
        end
        it 'marks duplicate npm packages for destruction', :aggregate_failures do
        packages_marked_for_destruction = described_class::Package
        .where(status: described_class::PENDING_DESTRUCTION_STATUS)
        expect { migration.perform }
        .to change { packages_marked_for_destruction.count }.from(0).to(2)
        expect(package_3.reload.status).not_to eq(described_class::PENDING_DESTRUCTION_STATUS)
        end
        end
        end
        # frozen_string_literal: true
        require 'spec_helper'
        require_migration!
        RSpec.describe QueueMarkDuplicateNpmPackagesForDestruction, feature_category: :package_registry do
        let!(:batched_migration) { described_class::MIGRATION }
        it 'schedules a new batched migration' do
        reversible_migration do |migration|
        migration.before -> {
        expect(batched_migration).not_to have_scheduled_batched_migration
        }
        migration.after -> {
        expect(batched_migration).to have_scheduled_batched_migration(
        table_name: :packages_packages,
        column_name: :project_id,
        interval: described_class::DELAY_INTERVAL,
        batch_size: described_class::BATCH_SIZE,
        batch_class_name: described_class::BATCH_CLASS_NAME,
        sub_batch_size: described_class::SUB_BATCH_SIZE
        )
        }
        end
        end
        end
        0% Loading or .
        You are about to add 0 people to the discussion. Proceed with caution.
        Finish editing this message first!
        Please register or to comment