Skip to content
Snippets Groups Projects
Commit 4f1417f0 authored by Adam Hegyi's avatar Adam Hegyi
Browse files

Introducing loose index scan AR helper

This MR adds loose index scan helper to active record models and an
extension to EachBatch to iterate over distinct values in a column that
contains non-distinct values.
parent 2c80b38f
No related branches found
No related tags found
1 merge request!74493Implement utility functions for loose index scan
......@@ -2,6 +2,7 @@
module EachBatch
extend ActiveSupport::Concern
include LooseIndexScan
class_methods do
# Iterates over the rows in a relation in batches, similar to Rails'
......@@ -100,5 +101,65 @@ def each_batch(of: 1000, column: primary_key, order: :asc, order_hint: nil)
break unless stop
end
end
# Iterates over the rows in a relation in batches by skipping duplicated values in the column.
# Example: counting the number of distinct authors in `issues`
#
# - Table size: 100_000
# - Column: author_id
# - Distinct author_ids in the table: 1000
#
# The query will read maximum 1000 rows if we have index coverage on user_id.
#
# > count = 0
# > Issue.distinct_each_batch(column: 'author_id', of: 1000) { |r| count += r.count(:author_id) }
def distinct_each_batch(column:, order: :asc, of: 1000)
start = except(:select)
.select(column)
.reorder(column => order)
start = start.take
return unless start
start_id = start[column]
arel_table = self.arel_table
arel_column = arel_table[column.to_s]
1.step do |index|
stop = loose_index_scan(column: column, order: order) do |cte_query, inner_query|
if order == :asc
[cte_query.where(arel_column.gteq(start_id)), inner_query]
else
[cte_query.where(arel_column.lteq(start_id)), inner_query]
end
end.offset(of).take
if stop
stop_id = stop[column]
relation = loose_index_scan(column: column, order: order) do |cte_query, inner_query|
if order == :asc
[cte_query.where(arel_column.gteq(start_id)), inner_query.where(arel_column.lt(stop_id))]
else
[cte_query.where(arel_column.lteq(start_id)), inner_query.where(arel_column.gt(stop_id))]
end
end
start_id = stop_id
else
relation = loose_index_scan(column: column, order: order) do |cte_query, inner_query|
if order == :asc
[cte_query.where(arel_column.gteq(start_id)), inner_query]
else
[cte_query.where(arel_column.lteq(start_id)), inner_query]
end
end
end
unscoped { yield relation, index }
break unless stop
end
end
end
end
# frozen_string_literal: true
module LooseIndexScan
extend ActiveSupport::Concern
class_methods do
# Builds a recursive query to read distinct values from a column.
#
# Example 1: collect all distinct author ids for the `issues` table
#
# Bad: The DB reads all issues, sorts and dedups them in memory
#
# > Issue.select(:author_id).distinct.map(&:author_id)
#
# Good: Use loose index scan (skip index scan)
#
# > Issue.loose_index_scan(column: :author_id).map(&:author_id)
#
# Example 2: List of users for the DONE todos selector. Select all users who created a todo.
#
# Bad: Loads all DONE todos for the given user and extracts the author_ids
#
# > User.where(id: Todo.where(user_id: 4156052).done.select(:author_id))
#
# Good: Loads distinct author_ids from todos and then loads users
#
# > distinct_authors = Todo.where(user_id: 4156052).done.loose_index_scan(column: :author_id).select(:author_id)
# > User.where(id: distinct_authors)
def loose_index_scan(column:, order: :asc)
arel_table = self.arel_table
arel_column = arel_table[column.to_s]
cte = Gitlab::SQL::RecursiveCTE.new(:loose_index_scan_cte, union_args: { remove_order: false })
cte_query = except(:select)
.select(column)
.order(column => order)
.limit(1)
inner_query = except(:select)
cte_query, inner_query = yield([cte_query, inner_query]) if block_given?
cte << cte_query
inner_query = if order == :asc
inner_query.where(arel_column.gt(cte.table[column.to_s]))
else
inner_query.where(arel_column.lt(cte.table[column.to_s]))
end
inner_query = inner_query.order(column => order)
.select(column)
.limit(1)
cte << cte.table
.project(Arel::Nodes::Grouping.new(Arel.sql(inner_query.to_sql)).as(column.to_s))
unscoped do
with
.recursive(cte.to_arel)
.from(cte.alias_to(arel_table))
.where(arel_column.not_eq(nil)) # filtering out the last NULL value
end
end
end
end
......@@ -42,20 +42,20 @@ The API of this method is similar to `in_batches`, though it doesn't support
all of the arguments that `in_batches` supports. You should always use
`each_batch` _unless_ you have a specific need for `in_batches`.
## Avoid iterating over non-unique columns
## Iterating over non-unique columns
One should proceed with extra caution, and possibly avoid iterating over a column that can contain
duplicate values. When you iterate over an attribute that is not unique, even with the applied max
batch size, there is no guarantee that the resulting batches do not surpass it. The following
snippet demonstrates this situation when one attempt to select `Ci::Build` entries for users with
`id` between `1` and `10,000`, the database returns `1 215 178` matching rows.
One should proceed with extra caution. When you iterate over an attribute that is not unique,
even with the applied max batch size, there is no guarantee that the resulting batches do not
surpass it. The following snippet demonstrates this situation when one attempt to select
`Ci::Build` entries for users with `id` between `1` and `10,000`, the database returns
`1 215 178` matching rows.
```ruby
[ gstg ] production> Ci::Build.where(user_id: (1..10_000)).size
=> 1215178
```
This happens because built relation is translated into the following query
This happens because the built relation is translated into the following query:
```ruby
[ gstg ] production> puts Ci::Build.where(user_id: (1..10_000)).to_sql
......@@ -69,6 +69,27 @@ threshold does not translate to the size of the returned dataset. That happens b
`n` possible values of attributes, one can't tell for sure that the number of records that contains
them is less than `n`.
### Loose-index scan with `distinct_each_batch`
When iterating over a non-unique column is necessary, use the `distinct_each_batch` helper
method. The helper uses the [loose-index scan technique](https://wiki.postgresql.org/wiki/Loose_indexscan)
(skip-index scan) to skip duplicated values within a database index.
Example: iterating over distinct `author_id` in the Issue model
```ruby
Issue.distinct_each_batch(column: :author_id, of: 1000) do |relation|
users = User.where(id: relation.select(:author_id)).to_a
end
```
The technique provides stable performance between the batches regardless of the data distribution.
The `relation` object returns an ActiveRecord scope where only the given `column` is available.
Other columns are not loaded.
The underlying database queries use recursive CTEs, which adds extra overhead. We therefore advise to use
smaller batch sizes than those used for a standard `each_batch` iteration.
## Column definition
`EachBatch` uses the primary key of the model by default for the iteration. This works most of the
......
......@@ -3,17 +3,17 @@
require 'spec_helper'
RSpec.describe EachBatch do
describe '.each_batch' do
let(:model) do
Class.new(ActiveRecord::Base) do
include EachBatch
let(:model) do
Class.new(ActiveRecord::Base) do
include EachBatch
self.table_name = 'users'
self.table_name = 'users'
scope :never_signed_in, -> { where(sign_in_count: 0) }
end
scope :never_signed_in, -> { where(sign_in_count: 0) }
end
end
describe '.each_batch' do
before do
create_list(:user, 5, updated_at: 1.day.ago)
end
......@@ -86,4 +86,89 @@
end
end
end
describe '.distinct_each_batch' do
let_it_be(:users) { create_list(:user, 5, sign_in_count: 0) }
let(:params) { {} }
subject(:values) do
values = []
model.distinct_each_batch(**params) { |rel| values.concat(rel.pluck(params[:column])) }
values
end
context 'when iterating over a unique column' do
context 'when using ascending order' do
let(:expected_values) { users.pluck(:id).sort }
let(:params) { { column: :id, of: 1, order: :asc } }
it { is_expected.to eq(expected_values) }
context 'when using larger batch size' do
before do
params[:of] = 3
end
it { is_expected.to eq(expected_values) }
end
context 'when using larger batch size than the result size' do
before do
params[:of] = 100
end
it { is_expected.to eq(expected_values) }
end
end
context 'when using descending order' do
let(:expected_values) { users.pluck(:id).sort.reverse }
let(:params) { { column: :id, of: 1, order: :desc } }
it { is_expected.to eq(expected_values) }
context 'when using larger batch size' do
before do
params[:of] = 3
end
it { is_expected.to eq(expected_values) }
end
end
end
context 'when iterating over a non-unique column' do
let(:params) { { column: :sign_in_count, of: 2, order: :asc } }
context 'when only one value is present' do
it { is_expected.to eq([0]) }
end
context 'when duplicated values present' do
let(:expected_values) { [2, 5] }
before do
users[0].reload.update!(sign_in_count: 5)
users[1].reload.update!(sign_in_count: 2)
users[2].reload.update!(sign_in_count: 5)
users[3].reload.update!(sign_in_count: 2)
users[4].reload.update!(sign_in_count: 5)
end
it { is_expected.to eq(expected_values) }
context 'when using descending order' do
let(:expected_values) { [5, 2] }
before do
params[:order] = :desc
end
it { is_expected.to eq(expected_values) }
end
end
end
end
end
# frozen_string_literal: true
# frozen_string_literal
require 'spec_helper'
RSpec.describe LooseIndexScan, type: :model do
let(:issue_model) do
Class.new(ApplicationRecord) do
include LooseIndexScan
self.table_name = 'issues'
end
end
let_it_be(:user_1) { create(:user) }
let_it_be(:user_2) { create(:user) }
let_it_be(:user_3) { create(:user) }
let_it_be(:issue_1) { create(:issue, author: user_2) }
let_it_be(:issue_2) { create(:issue, author: user_1) }
let_it_be(:issue_3) { create(:issue, author: user_1) }
let_it_be(:issue_4) { create(:issue, author: user_2) }
let_it_be(:issue_5) { create(:issue, author: user_3) }
context 'loading distinct author_ids' do
subject(:author_ids) { issue_model.loose_index_scan(column: :author_id, order: order).pluck(:author_id) }
shared_examples 'assert distinct values example' do
it 'loads the distinct values in the correct order' do
expect(author_ids).to eq(expected_order)
end
end
context 'when using ascending order' do
let(:order) { :asc }
let(:expected_order) { [user_1.id, user_2.id, user_3.id] }
it_behaves_like 'assert distinct values example'
context 'when null values are present' do
before do
issue_1.author_id = nil
issue_1.save!(validate: false)
end
it_behaves_like 'assert distinct values example'
end
context 'when using descending order' do
let(:order) { :desc }
let(:expected_order) { [user_3.id, user_2.id, user_1.id] }
it_behaves_like 'assert distinct values example'
end
end
end
end
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment