Skip to content
Snippets Groups Projects
Commit 61f1bbc3 authored by charlie ablett
Browse files

Merge branch '414610-cablett-clickhouse-data-collector' into 'master'

ClickHouse data collector

See merge request gitlab-org/gitlab!127435



Merged-by: charlie ablett <cablett@gitlab.com>
Co-authored-by: Adam Hegyi <ahegyi@gitlab.com>
parents 9401590e 013cdbc7
No related branches found
No related tags found
No related merge requests found
Showing
with 340 additions and 156 deletions
---
# Feature flag: routes contribution analytics reads to the ClickHouse data
# collector instead of the default PostgreSQL collector when enabled.
name: clickhouse_data_collection
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/127435
rollout_issue_url: https://gitlab.com/gitlab-org/gitlab/-/issues/420257
milestone: '16.3'
type: development
group: group::optimize
# Off by default; enabled per-environment during rollout.
default_enabled: false
-- Flat events table used by contribution analytics queries.
-- Rows are deduplicated by `id` at read time (argMax over updated_at),
-- so the table itself can hold multiple versions of the same event.
CREATE TABLE contribution_analytics_events
(
id UInt64 DEFAULT 0,
path String DEFAULT '',
author_id UInt64 DEFAULT 0,
target_type LowCardinality(String) DEFAULT '',
action UInt8 DEFAULT 0,
-- FIX: was `toYear(now())`, which yields a year number (e.g. 2023);
-- assigned to a Date column that is interpreted as days since epoch,
-- producing a date in 1975. `toDate(now())` is the intended default.
created_at Date DEFAULT toDate(now()),
updated_at DateTime64(6, 'UTC') DEFAULT now()
)
ENGINE = MergeTree
-- Ordered so prefix scans on `path` plus date-range filters are efficient.
ORDER BY (path, created_at, author_id, id);
-- Materialized view feeding `contribution_analytics_events` from the raw
-- `events` stream. For each event id it keeps the latest version of every
-- column (argMax over updated_at), i.e. a last-write-wins projection.
CREATE MATERIALIZED VIEW contribution_analytics_events_mv
TO contribution_analytics_events
AS
SELECT
id,
argMax(path, events.updated_at) as path,
argMax(author_id, events.updated_at) as author_id,
argMax(target_type, events.updated_at) as target_type,
argMax(action, events.updated_at) as action,
argMax(date(created_at), events.updated_at) as created_at,
max(events.updated_at) as updated_at
FROM events
-- Only contribution-relevant events are materialized:
--   action = 5 with empty target_type (pushes), or
--   actions 1/3/7/12 targeting MergeRequest or Issue
-- (created/joined/merged/closed style actions — numeric codes come from the
-- application's Event enum; confirm against the Rails model).
where (("events"."action" = 5 AND "events"."target_type" = '')
OR ("events"."action" IN (1, 3, 7, 12)
AND "events"."target_type" IN ('MergeRequest', 'Issue')))
GROUP BY id
# frozen_string_literal: true
module Gitlab
  module ContributionAnalytics
    # Aggregates contribution analytics event counts from ClickHouse for a
    # group over a date range. ClickHouse-backed counterpart of
    # PostgresqlDataCollector (selected via the :clickhouse_data_collection
    # feature flag).
    class ClickHouseDataCollector
      attr_reader :group, :from, :to

      # @param group [Group] group whose subtree of events is queried
      # @param from [Date] inclusive start of the range
      # @param to [Date] inclusive end of the range
      def initialize(group:, from:, to:)
        @group = group
        @from = from
        @to = to
      end

      # Returns counts grouped by (author_id, target_type, action).
      # The inner subquery deduplicates rows by event id, taking the latest
      # version of each column (argMax over updated_at); the outer query
      # aggregates the deduplicated rows.
      def totals_by_author_target_type_action
        clickhouse_query = <<~CLICKHOUSE
          SELECT count(*) as count_all,
            "contribution_analytics_events"."author_id" AS events_author_id,
            "contribution_analytics_events"."target_type" AS events_target_type,
            "contribution_analytics_events"."action" AS events_action
          FROM (
            SELECT
              id,
              argMax(author_id, contribution_analytics_events.updated_at) as author_id,
              argMax(target_type, contribution_analytics_events.updated_at) as target_type,
              argMax(action, contribution_analytics_events.updated_at) as action
            FROM contribution_analytics_events
            WHERE startsWith(path, '#{group_path_prefix}')
              AND "contribution_analytics_events"."created_at" >= '#{format_date(from)}'
              AND "contribution_analytics_events"."created_at" <= '#{format_date(to)}'
            GROUP BY id
          ) contribution_analytics_events
          GROUP BY "contribution_analytics_events"."action","contribution_analytics_events"."target_type","contribution_analytics_events"."author_id"
        CLICKHOUSE

        ClickHouse::Client.select(clickhouse_query, :main)
      end

      private

      # Path prefix matching the group and all of its subgroups, e.g. "1/42/".
      # Bug fix: the original interpolation was missing the closing single
      # quote ('#{...}/) — generating syntactically invalid SQL. Safe to
      # interpolate: traversal_ids are integer primary keys, not user input.
      def group_path_prefix
        "#{group.traversal_ids.join('/')}/"
      end

      # ClickHouse Date literal (YYYY-MM-DD).
      def format_date(date)
        date.strftime('%Y-%m-%d')
      end
    end
  end
end
......@@ -30,7 +30,13 @@ def data_formatter
end
def db_data_collector
@data_formatter ||= PostgresqlDataCollector.new(group: group, from: from, to: to)
@data_formatter ||= db_collector_klass.new(group: group, from: from, to: to)
end
# Selects the data-collector class for contribution analytics reads:
# ClickHouse when the :clickhouse_data_collection flag is enabled,
# PostgreSQL otherwise.
def db_collector_klass
return ClickHouseDataCollector if Feature.enabled?(:clickhouse_data_collection)
PostgresqlDataCollector
end
# Format:
......
......@@ -133,76 +133,96 @@ def create_push_event(author, project)
expect(assigns[:data_collector].totals[:total_events].values.sum).to eq(6)
end
it "returns member contributions JSON when format is JSON" do
get :show, params: { group_id: group.path }, format: :json
expect(json_response.length).to eq(3)
first_user = json_response.at(0)
expect(first_user["username"]).to eq(user.username)
expect(first_user["user_web_url"]).to eq("/#{user.username}")
expect(first_user["fullname"]).to eq(user.name)
expect(first_user["push"]).to eq(1)
expect(first_user["issues_created"]).to eq(0)
expect(first_user["issues_closed"]).to eq(1)
expect(first_user["merge_requests_created"]).to eq(0)
expect(first_user["merge_requests_merged"]).to eq(0)
expect(first_user["total_events"]).to eq(2)
end
shared_examples 'correct data is returned' do
it "returns member contributions JSON when format is JSON" do
get :show, params: { group_id: group.path }, format: :json
it "includes projects in subgroups" do
subgroup = create(:group, parent: group)
subproject = create(:project, :repository, group: subgroup)
expect(json_response.length).to eq(3)
first_user = json_response.at(0)
expect(first_user["username"]).to eq(user.username)
expect(first_user["user_web_url"]).to eq("/#{user.username}")
expect(first_user["fullname"]).to eq(user.name)
expect(first_user["push"]).to eq(1)
expect(first_user["issues_created"]).to eq(0)
expect(first_user["issues_closed"]).to eq(1)
expect(first_user["merge_requests_created"]).to eq(0)
expect(first_user["merge_requests_merged"]).to eq(0)
expect(first_user["total_events"]).to eq(2)
end
create_event(user, subproject, issue, :closed)
create_push_event(user, subproject)
it "includes projects in subgroups" do
subgroup = create(:group, parent: group)
subproject = create(:project, :repository, group: subgroup)
get :show, params: { group_id: group.path }, format: :json
create_event(user, subproject, issue, :closed)
create_push_event(user, subproject)
first_user = json_response.first
expect(first_user["issues_closed"]).to eq(2)
expect(first_user["push"]).to eq(2)
end
get :show, params: { group_id: group.path }, format: :json
it "excludes projects outside of the group" do
empty_group = create(:group)
other_project = create(:project, :repository)
first_user = json_response.first
expect(first_user["issues_closed"]).to eq(2)
expect(first_user["push"]).to eq(2)
end
empty_group.add_reporter(user)
it "excludes projects outside of the group" do
empty_group = create(:group)
other_project = create(:project, :repository)
create_event(user, other_project, issue, :closed)
create_push_event(user, other_project)
empty_group.add_reporter(user)
get :show, params: { group_id: empty_group.path }, format: :json
create_event(user, other_project, issue, :closed)
create_push_event(user, other_project)
expect(json_response).to be_empty
get :show, params: { group_id: empty_group.path }, format: :json
expect(json_response).to be_empty
end
end
it 'does not cause N+1 queries when the format is JSON' do
control_count = ActiveRecord::QueryRecorder.new do
get :show, params: { group_id: group.path }, format: :json
context 'when postgres is the data source' do
it_behaves_like 'correct data is returned'
it 'does not cause N+1 queries when the format is JSON' do
control_count = ActiveRecord::QueryRecorder.new do
get :show, params: { group_id: group.path }, format: :json
end
controller.instance_variable_set(:@group, nil)
user4 = create(:user)
group.add_member(user4, GroupMember::DEVELOPER)
expect { get :show, params: { group_id: group.path }, format: :json }
.not_to exceed_query_limit(control_count)
end
controller.instance_variable_set(:@group, nil)
user4 = create(:user)
group.add_member(user4, GroupMember::DEVELOPER)
describe 'with views' do
render_views
expect { get :show, params: { group_id: group.path }, format: :json }
.not_to exceed_query_limit(control_count)
end
it 'avoids a N+1 query in #show' do
# Warm the cache
get :show, params: { group_id: group.path }
describe 'with views' do
render_views
control_queries = ActiveRecord::QueryRecorder.new { get :show, params: { group_id: group.path } }
create_push_event(user, project)
it 'avoids a N+1 query in #show' do
# Warm the cache
get :show, params: { group_id: group.path }
expect { get :show, params: { group_id: group.path } }.not_to exceed_query_limit(control_queries)
end
end
end
control_queries = ActiveRecord::QueryRecorder.new { get :show, params: { group_id: group.path } }
create_push_event(user, project)
context 'when clickhouse is the data source', :click_house do
before do
stub_feature_flags(clickhouse_data_collection: true)
end
expect { get :show, params: { group_id: group.path } }.not_to exceed_query_limit(control_queries)
around do |example|
with_net_connect_allowed do
example.run
end
end
it_behaves_like 'correct data is returned'
end
describe 'GET #show' do
......
......@@ -30,13 +30,33 @@
contributions_query_path = 'analytics/contribution_analytics/graphql/contributions.query.graphql'
it "graphql/#{contributions_query_path}.json" do
query = get_graphql_query_as_string(contributions_query_path, ee: true)
shared_examples 'successful contribution analytics query' do
it "graphql/#{contributions_query_path}.json" do
query = get_graphql_query_as_string(contributions_query_path, ee: true)
post_graphql(query, current_user: user_1,
variables: { fullPath: group.full_path, startDate: '2023-03-12', endDate: '2023-04-12' })
post_graphql(query, current_user: user_1,
variables: { fullPath: group.full_path, startDate: '2023-03-12', endDate: '2023-04-12' })
expect_graphql_errors_to_be_empty
expect_graphql_errors_to_be_empty
end
end
context 'when postgres is the data source' do
it_behaves_like 'successful contribution analytics query'
end
context 'when clickhouse is the data source', :click_house do
before do
stub_feature_flags(clickhouse_data_collection: true)
end
around do |example|
with_net_connect_allowed do
example.run
end
end
it_behaves_like 'successful contribution analytics query'
end
end
......
......@@ -16,77 +16,97 @@ def resolve_contributions(args = {}, context = { current_user: current_user })
let_it_be(:user) { create(:user).tap { |u| group.add_developer(user) } }
let(:current_user) { user }
context 'without data' do
it { expect(resolve_contributions(args)).to be_empty }
end
shared_examples 'contributions resolver' do
context 'without data' do
it { expect(resolve_contributions(args)).to be_empty }
end
context 'with data' do
let_it_be(:another_user) { create(:user).tap { |u| group.add_developer(user) } }
context 'with data' do
let_it_be(:another_user) { create(:user).tap { |u| group.add_developer(user) } }
let_it_be(:event1) do
create(:event, :pushed, project: project, author: user, created_at: Date.parse('2022-04-27'))
end
let_it_be(:event1) do
create(:event, :pushed, project: project, author: user, created_at: Date.parse('2022-04-27'))
end
let_it_be(:event2) do
create(:event, :pushed, project: project, author: another_user, created_at: Date.parse('2022-05-01'))
end
let_it_be(:event2) do
create(:event, :pushed, project: project, author: another_user, created_at: Date.parse('2022-05-01'))
end
let_it_be(:event3) do
create(:event, :created, :for_issue, project: project, author: user, created_at: Date.parse('2022-05-05'))
end
let_it_be(:event3) do
create(:event, :created, :for_issue, project: project, author: user, created_at: Date.parse('2022-05-05'))
end
it 'returns the aggregated event counts' do
contributions = resolve_contributions(args)
expect(contributions).to eq([
{
user: user,
issues_closed: 0,
issues_created: 1,
merge_requests_approved: 0,
merge_requests_closed: 0,
merge_requests_created: 0,
merge_requests_merged: 0,
push: 1,
total_events: 2
},
{
user: another_user,
issues_closed: 0,
issues_created: 0,
merge_requests_approved: 0,
merge_requests_closed: 0,
merge_requests_created: 0,
merge_requests_merged: 0,
push: 1,
total_events: 1
}
])
end
it 'returns the aggregated event counts' do
contributions = resolve_contributions(args)
expect(contributions).to eq([
{
user: user,
issues_closed: 0,
issues_created: 1,
merge_requests_approved: 0,
merge_requests_closed: 0,
merge_requests_created: 0,
merge_requests_merged: 0,
push: 1,
total_events: 2
},
{
user: another_user,
issues_closed: 0,
issues_created: 0,
merge_requests_approved: 0,
merge_requests_closed: 0,
merge_requests_created: 0,
merge_requests_merged: 0,
push: 1,
total_events: 1
}
])
end
context 'when the date range is too wide' do
let(:args) { { from: Date.parse('2021-01-01'), to: Date.parse('2022-05-10') } }
context 'when the date range is too wide' do
let(:args) { { from: Date.parse('2021-01-01'), to: Date.parse('2022-05-10') } }
it 'raises error' do
error_message = s_('ContributionAnalytics|The given date range is larger than 93 days')
it 'raises error' do
error_message = s_('ContributionAnalytics|The given date range is larger than 93 days')
expect_graphql_error_to_be_created(Gitlab::Graphql::Errors::ArgumentError, error_message) do
resolve_contributions(args)
expect_graphql_error_to_be_created(Gitlab::Graphql::Errors::ArgumentError, error_message) do
resolve_contributions(args)
end
end
end
end
context 'when `to` is earlier than `from`' do
let(:args) { { to: Date.parse('2022-04-25'), from: Date.parse('2022-05-10') } }
context 'when `to` is earlier than `from`' do
let(:args) { { to: Date.parse('2022-04-25'), from: Date.parse('2022-05-10') } }
it 'raises error' do
error_message = s_('ContributionAnalytics|The to date is earlier than the given from date')
it 'raises error' do
error_message = s_('ContributionAnalytics|The to date is earlier than the given from date')
expect_graphql_error_to_be_created(Gitlab::Graphql::Errors::ArgumentError, error_message) do
resolve_contributions(args)
expect_graphql_error_to_be_created(Gitlab::Graphql::Errors::ArgumentError, error_message) do
resolve_contributions(args)
end
end
end
end
end
context 'when postgres is the data source' do
it_behaves_like 'contributions resolver'
end
context 'when clickhouse is the data source', :click_house do
before do
stub_feature_flags(clickhouse_data_collection: true)
end
around do |example|
with_net_connect_allowed do
example.run
end
end
it_behaves_like 'contributions resolver'
end
end
end
......@@ -24,28 +24,48 @@
create(:event, :pushed, project: project1, target: nil, author: user_2)
end
describe '#totals' do
it 'returns formatted data for received events' do
data_formatter = described_class.new(data)
expect(data_formatter.totals).to eq({
issues_closed: { user_1.id => 1 },
issues_created: {},
merge_requests_created: { user_1.id => 1 },
merge_requests_merged: {},
merge_requests_approved: { user_1.id => 1 },
merge_requests_closed: { user_1.id => 1 },
push: { user_1.id => 1, user_2.id => 1 },
total_events: { user_1.id => 5, user_2.id => 1 }
})
shared_examples 'correct collection of data' do
describe '#totals' do
it 'returns formatted data for received events' do
data_formatter = described_class.new(data)
expect(data_formatter.totals).to eq({
issues_closed: { user_1.id => 1 },
issues_created: {},
merge_requests_created: { user_1.id => 1 },
merge_requests_merged: {},
merge_requests_approved: { user_1.id => 1 },
merge_requests_closed: { user_1.id => 1 },
push: { user_1.id => 1, user_2.id => 1 },
total_events: { user_1.id => 5, user_2.id => 1 }
})
end
end
end
context 'when postgres is the data source' do
it_behaves_like 'correct collection of data'
describe '#users' do
it 'returns correct users' do
users = described_class.new(data).users
expect(users).to match_array([user_1, user_2])
end
end
end
describe '#users' do
it 'returns correct users' do
users = described_class.new(data).users
context 'when clickhouse is the data source', :click_house do
before do
stub_feature_flags(clickhouse_data_collection: true)
end
expect(users).to match_array([user_1, user_2])
around do |example|
with_net_connect_allowed do
example.run
end
end
it_behaves_like 'correct collection of data'
end
end
......@@ -11,19 +11,19 @@
let(:query) do
<<~QUERY
query($fullPath: ID!) {
group(fullPath: $fullPath) {
contributions(from: "2022-01-01", to: "2022-01-10") {
nodes {
user {
id
query($fullPath: ID!) {
group(fullPath: $fullPath) {
contributions(from: "2022-01-01", to: "2022-01-10") {
nodes {
user {
id
}
totalEvents
repoPushed
}
totalEvents
repoPushed
}
}
}
}
QUERY
end
......@@ -43,30 +43,50 @@
create(:event, :pushed, project: project, author: user, created_at: Date.parse('2022-01-05'))
end
it 'returns data' do
post_graphql(query, current_user: user, variables: { fullPath: group.full_path })
shared_examples 'returns correct data' do
it 'returns data' do
post_graphql(query, current_user: user, variables: { fullPath: group.full_path })
expect(graphql_data_at('group', 'contributions', 'nodes')).to eq([
{ 'user' => { 'id' => user.to_gid.to_s },
'totalEvents' => 1,
'repoPushed' => 1 }
])
expect(graphql_data_at('group', 'contributions', 'nodes')).to eq([
{ 'user' => { 'id' => user.to_gid.to_s },
'totalEvents' => 1,
'repoPushed' => 1 }
])
end
end
context 'with events from different users' do
def run_query
post_graphql(query, current_user: user, variables: { fullPath: group.full_path })
end
context 'when postgres is the data source' do
it_behaves_like 'returns correct data'
context 'with events from different users' do
def run_query
post_graphql(query, current_user: user, variables: { fullPath: group.full_path })
end
it 'does not create N+1 queries' do
# warm the query to avoid flakiness
run_query
it 'does not create N+1 queries' do
# warm the query to avoid flakiness
run_query
control_count = ActiveRecord::QueryRecorder.new { run_query }
control_count = ActiveRecord::QueryRecorder.new { run_query }
create(:event, :pushed, project: project, author: create(:user), created_at: Date.parse('2022-01-05'))
expect { run_query }.not_to exceed_all_query_limit(control_count)
create(:event, :pushed, project: project, author: create(:user), created_at: Date.parse('2022-01-05'))
expect { run_query }.not_to exceed_all_query_limit(control_count)
end
end
end
context 'when clickhouse is the data source', :click_house do
before do
stub_feature_flags(clickhouse_data_collection: true)
end
around do |example|
with_net_connect_allowed do
example.run
end
end
it_behaves_like 'returns correct data'
end
end
end
......@@ -345,6 +345,9 @@
# Keep-around refs should only be turned off for specific projects/repositories.
stub_feature_flags(disable_keep_around_refs: false)
# Postgres is the primary data source, and ClickHouse only when enabled in certain cases.
stub_feature_flags(clickhouse_data_collection: false)
allow(Gitlab::GitalyClient).to receive(:can_use_disk?).and_return(enable_rugged)
else
unstub_all_feature_flags
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment