Skip to content

Implement lock-retry mechanism

As menttioned in &13805 (comment 1899438887), we need some sort of lock-retry mechanism to attempt to run a transaction in a loop with a low lock_timeout, failing and retrying if we fail to get a lock. A lock-retry mechanism is a pattern where you attempt to execute a DDL transaction with a very short lock timeout, and if you can't acquire the necessary locks quickly, you abort and retry instead of blocking other queries indefinitely.

This helps because when we try to execute a rename query:

  1. PostgreSQL needs an ACCESS EXCLUSIVE lock on the blobs table
  2. If another query is using the table (even a simple SELECT), PostgreSQL waits
  3. All subsequent queries queue behind the migration, creating a pileup of blocked queries

with the lock retry mechanism we can instead:

  1. Try to acquire lock with 2-second timeout
  2. If can't get lock in 2s → ABORT (don't block anyone)
  3. Wait a few seconds (let current queries finish)
  4. Retry the transaction
  5. Repeat until you get the lock during a quiet moment
sample code

func executeWithLockRetry(ctx context.Context, db *sql.DB, statements []string, maxRetries int) error {
    for attempt := 0; attempt < maxRetries; attempt++ {
        tx, err := db.BeginTx(ctx, nil)
        if err != nil {
            return err
        }
        
        // Set aggressive lock timeout (2 seconds)
        _, err = tx.ExecContext(ctx, "SET LOCAL lock_timeout = '2s'")
        if err != nil {
            tx.Rollback()
            return err
        }
        
        // Try to execute all DDL statements
        for _, stmt := range statements {
            _, err = tx.ExecContext(ctx, stmt)
            if err != nil {
                tx.Rollback()
                
                // If it's a lock timeout and we have retries left, try again
                if isLockTimeout(err) && attempt < maxRetries-1 {
                    sleepTime := time.Duration(attempt+1) * time.Second
                    log.Printf("Lock timeout, retry %d/%d after %v", attempt+1, maxRetries, sleepTime)
                    time.Sleep(sleepTime) // Exponential backoff
                    break // Exit inner loop to retry
                }
                return err
            }
        }
        
        // Try to commit
        if err := tx.Commit(); err == nil {
            return nil // Success!
        }
        
        // Commit failed due to lock timeout, retry
        if isLockTimeout(err) && attempt < maxRetries-1 {
            time.Sleep(time.Duration(attempt+1) * time.Second)
            continue
        }
        return err
    }
    
    return fmt.Errorf("failed to acquire locks after %d attempts", maxRetries)
}

func isLockTimeout(err error) bool {
    return strings.Contains(err.Error(), "lock_timeout") || 
           strings.Contains(err.Error(), "canceling statement due to lock timeout")
}
Edited by SAhmed