NPM Data Migration from CouchDB to deps.dev for License DB
Problem
The License DB project needs to migrate existing NPM data from CouchDB to a new source, deps.dev
, to enhance data handling and processing capabilities. The current setup does not support dual data sourcing, which is crucial for a smooth transition and rollback capabilities if necessary.
Solution
-
Step 1: Create New Table for Transitional Data We'll establish a new table called
npm_licenses_deps_dev
. This table will begin receiving data from deps.dev as we complete work on each segment, serving as a staging area before the full transition. (#459068 (closed)) -
Step 2: Preserve Original Data To ensure we have a reliable fallback, we will copy the existing
npm_licenses
table toold_npm_licenses
. This step secures our original data by providing a snapshot before the transition. -
Step 3: Data Transition Once we are confident in the integrity and completeness of the data in
npm_licenses_deps_dev
, we will drop the existingnpm_licenses
table, recreate it, and fill it with the data fromnpm_licenses_deps_dev
. This ensures continuity and integrity in the transition process. -
Step 4: Synchronize Timestamps We will run an UPDATE query with a LEFT JOIN to align
npm_licenses.last_updated
withold_npm_licenses.last_updated
wherever the entries match. This step is crucial for ensuring that the historical data's timestamps are preserved, allowing the new data from deps.dev to seamlessly integrate with our existing timeline. -
Step 5: Clean up and Finalize After verifying the transition is stable, we will remove the
npm_licenses_deps_dev
table and update the Processor to write directly tonpm_licenses
. This concludes the integration and cleanup phase.
Implementation Plan
Write Migrations for Identified Cases
-
Case 1: Insert Missing Components and Licenses
- Create a migration script to insert components from
deps_dev_npm_component
intonpm_component
if they do not exist. - Create a migration script to insert corresponding licenses from
deps_dev_npm_license
intonpm_license
.
- Create a migration script to insert components from
For example:
INSERT INTO npm_component (name, last_updated)
SELECT d.name, d.last_updated
FROM deps_dev_npm_component d
LEFT JOIN npm_component n ON d.name = n.name
WHERE n.name IS NULL;
INSERT INTO npm_license (component_id, license_ids, version, created, last_updated)
SELECT n.id, l.license_ids, l.version, l.created, l.last_updated
FROM deps_dev_npm_license l
JOIN deps_dev_npm_component d ON l.component_id = d.id
JOIN npm_component n ON d.name = n.name
LEFT JOIN npm_license nl ON n.id = nl.component_id AND l.version = nl.version
WHERE nl.component_id IS NULL;
-
Case 2: Update Existing Licenses
- Create a migration script to update
license_ids
andlast_updated
innpm_license
fromdeps_dev_npm_license
where they differ.
- Create a migration script to update
For example:
UPDATE npm_license nl
SET license_ids = l.license_ids, last_updated = l.last_updated
FROM deps_dev_npm_license l
JOIN deps_dev_npm_component d ON l.component_id = d.id
JOIN npm_component n ON d.name = n.name
WHERE nl.component_id = n.id AND nl.version = l.version
AND nl.license_ids IS DISTINCT FROM l.license_ids;
-
Case 3: Insert Missing Licenses
- Create a migration script to insert licenses from
deps_dev_npm_license
intonpm_license
if they do not exist.
- Create a migration script to insert licenses from
For example:
INSERT INTO npm_license (component_id, license_ids, version, created, last_updated)
SELECT n.id, l.license_ids, l.version, l.created, l.last_updated
FROM deps_dev_npm_license l
JOIN deps_dev_npm_component d ON l.component_id = d.id
JOIN npm_component n ON d.name = n.name
LEFT JOIN npm_license nl ON n.id = nl.component_id AND l.version = nl.version
WHERE nl.component_id IS NULL;
Write Tests for Migration Cases
-
Test Insert Missing Components and Licenses
- Set up test environment and database connection.
- Insert initial data into
deps_dev_npm_component
anddeps_dev_npm_license
. - Run the migration.
- Verify components are inserted into
npm_component
. - Verify licenses are inserted into
npm_license
.
func TestInsertMissingComponentsAndLicenses(t *testing.T) {
if os.Getenv("TEST_INTEGRATION") == "" {
t.Skip()
}
db := setupTestDB(t)
defer db.Close()
truncateTables(t, db)
_, err := db.Exec(`
INSERT INTO deps_dev_npm_component (name, last_updated) VALUES ('component1', now());
INSERT INTO deps_dev_npm_license (component_id, license_ids, version, created, last_updated) VALUES (1, '{1,2}', '1.0.0', now(), now());
`)
require.NoError(t, err)
require.NoError(t, migrations.RunGooseCommand("up", testEnv, os.Getenv("POSTGRES_CONN"), "files/", ""))
var componentCount int
err = db.QueryRow("SELECT COUNT(*) FROM npm_component WHERE name = 'component1'").Scan(&componentCount)
require.NoError(t, err)
require.Equal(t, 1, componentCount, "Expected 1 component to be inserted")
var licenseCount int
err = db.QueryRow("SELECT COUNT(*) FROM npm_license WHERE component_id = 1 AND version = '1.0.0'").Scan(&licenseCount)
require.NoError(t, err)
require.Equal(t, 1, licenseCount, "Expected 1 license to be inserted")
}
-
Test Update Existing Licenses
- Set up test environment and database connection.
- Insert initial data into
npm_component
,deps_dev_npm_component
,npm_license
, anddeps_dev_npm_license
. - Run the migration.
- Verify
license_ids
andlast_updated
innpm_license
are updated.
func TestUpdateExistingLicenses(t *testing.T) {
if os.Getenv("TEST_INTEGRATION") == "" {
t.Skip()
}
db := setupTestDB(t)
defer db.Close()
truncateTables(t, db)
_, err := db.Exec(`
INSERT INTO npm_component (name, last_updated) VALUES ('component1', now());
INSERT INTO deps_dev_npm_component (name, last_updated) VALUES ('component1', now());
INSERT INTO npm_license (component_id, license_ids, version, created, last_updated) VALUES (1, '{1,2}', '1.0.0', now(), now());
INSERT INTO deps_dev_npm_license (component_id, license_ids, version, created, last_updated) VALUES (1, '{2,3}', '1.0.0', now(), now());
`)
require.NoError(t, err)
require.NoError(t, migrations.RunGooseCommand("up", testEnv, os.Getenv("POSTGRES_CONN"), "files/", ""))
var licenseIDs string
err = db.QueryRow("SELECT license_ids FROM npm_license WHERE component_id = 1 AND version = '1.0.0'").Scan(&licenseIDs)
require.NoError(t, err)
require.Equal(t, "{2,3}", licenseIDs, "Expected license_ids to be '{2,3}'")
}
-
Test Insert Missing Licenses
- Set up test environment and database connection.
- Insert initial data into
npm_component
,deps_dev_npm_component
, anddeps_dev_npm_license
. - Run the migration.
- Verify licenses are inserted into
npm_license
.
func TestInsertMissingLicenses(t *testing.T) {
if os.Getenv("TEST_INTEGRATION") == "" {
t.Skip()
}
db := setupTestDB(t)
defer db.Close()
truncateTables(t, db)
_, err := db.Exec(`
INSERT INTO npm_component (name, last_updated) VALUES ('component1', now());
INSERT INTO deps_dev_npm_component (name, last_updated) VALUES ('component1', now());
INSERT INTO deps_dev_npm_license (component_id, license_ids, version, created, last_updated) VALUES (1, '{1,2}', '1.0.0', now(), now());
`)
require.NoError(t, err)
require.NoError(t, migrations.RunGooseCommand("up", testEnv, os.Getenv("POSTGRES_CONN"), "files/", ""))
var licenseCount int
err = db.QueryRow("SELECT COUNT(*) FROM npm_license WHERE component_id = 1 AND version = '1.0.0'").Scan(&licenseCount)
require.NoError(t, err)
require.Equal(t, 1, licenseCount, "Expected 1 license to be inserted")
}
Additional Steps
- Disable the
NPM exporter
inDEV
andPROD
- Disable the
NPM feeder
inDEV
andPROD
DEV
- Take a database snapshot
- Deploy the migrations
- Run the exporter
- Verify export results
- Disable the migration check in the
processor
, updatedeps.dev
sources to write tonpm_components
andnpm_licenses
and deploy - Switch the
NPM feeder
to usedeps.dev
sources - Verify export results of triggering the
NPM feeder
- Re-Enable the
NPM exporter
- Re-Enable the
NPM feeder
PROD
- Take a database snapshot
- Deploy the migrations
- Run the exporter
- Verify export results
- Disable the migration check in the
processor
, updatedeps.dev
sources to write tonpm_components
andnpm_licenses
and deploy - Switch the
NPM feeder
to usedeps.dev
sources - Verify export results of triggering the
NPM feeder
- Re-Enable the
NPM exporter
- Re-Enable the
NPM feeder
Cleanup
1. Drop deps_dev_licenses
and deps_dev_components