Skip to content
Snippets Groups Projects
Commit 013cdbc7 authored by charlie ablett's avatar charlie ablett 🛠️
Browse files

ClickHouse data collector for contribution analytics

Changelog: added
EE: true
parent 87bff275
No related branches found
No related tags found
No related merge requests found
This commit is part of merge request !127435. Comments created here will be created in the context of that merge request.
Showing
with 340 additions and 266 deletions
...@@ -452,24 +452,6 @@ ...@@ -452,24 +452,6 @@
CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT: 1 CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT: 1
CLICKHOUSE_DB: gitlab_clickhouse_test CLICKHOUSE_DB: gitlab_clickhouse_test
# Hidden CI job template: Postgres 14 + Redis (single + cluster) + ClickHouse 23
# services for specs that exercise the ClickHouse data path.
# (Indentation restored — the scraped source had lost it, which is invalid YAML.)
.use-pg14-clickhouse23:
  services:
    - name: ${REGISTRY_HOST}/${REGISTRY_GROUP}/gitlab-build-images:postgres-14-pgvector-0.4.1
      # Durability disabled for test speed; data loss on crash is acceptable in CI.
      command: ["postgres", "-c", "fsync=off", "-c", "synchronous_commit=off", "-c", "full_page_writes=off"]
      alias: postgres
    - name: ${REGISTRY_HOST}/${REGISTRY_GROUP}/gitlab-build-images:redis-cluster-6.2.12
      alias: rediscluster # configure connections in config/redis.yml
    - name: redis:6.2-alpine
    - name: clickhouse/clickhouse-server:23-alpine
      alias: clickhouse
  variables:
    POSTGRES_HOST_AUTH_METHOD: trust
    PG_VERSION: "14"
    CLICKHOUSE_USER: clickhouse
    CLICKHOUSE_PASSWORD: clickhouse
    CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT: 1
    CLICKHOUSE_DB: gitlab_clickhouse_test
.use-kaniko: .use-kaniko:
image: image:
name: ${REGISTRY_HOST}/${REGISTRY_GROUP}/gitlab-build-images:kaniko name: ${REGISTRY_HOST}/${REGISTRY_GROUP}/gitlab-build-images:kaniko
......
---
# Feature flag gating the ClickHouse-backed contribution analytics data
# collector; when disabled the Postgres-backed collector is used instead.
name: clickhouse_data_collection
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/127435
rollout_issue_url: https://gitlab.com/gitlab-org/gitlab/-/issues/420257
milestone: '16.3'
type: development
group: group::optimize
default_enabled: false
-- Denormalized contribution events table. Rows may exist in multiple versions
-- per event id; readers deduplicate by taking argMax(..., updated_at) per id.
CREATE TABLE contribution_analytics_events
(
id UInt64 DEFAULT 0,
path String DEFAULT '',
author_id UInt64 DEFAULT 0,
target_type LowCardinality(String) DEFAULT '',
action UInt8 DEFAULT 0,
-- FIX: was `toYear(now())`, which returns the integer year (e.g. 2023);
-- implicitly cast to Date that becomes day #2023 since the 1970 epoch
-- (mid-1975), not today. `toDate(now())` yields the intended current date.
created_at Date DEFAULT toDate(now()),
updated_at DateTime64(6, 'UTC') DEFAULT now()
)
ENGINE = MergeTree
-- Sort key leads with path so startsWith(path, '<traversal_ids>/') prefix
-- filters can use the primary index.
ORDER BY (path, created_at, author_id, id);
-- Materialized view feeding contribution_analytics_events from the raw
-- `events` table: for each event id it keeps the latest value of every
-- attribute, selected by argMax over updated_at.
CREATE MATERIALIZED VIEW contribution_analytics_events_mv
TO contribution_analytics_events
AS
SELECT
id,
argMax(path, events.updated_at) as path,
argMax(author_id, events.updated_at) as author_id,
argMax(target_type, events.updated_at) as target_type,
argMax(action, events.updated_at) as action,
argMax(date(created_at), events.updated_at) as created_at,
max(events.updated_at) as updated_at
FROM events
-- Only contribution-relevant events: targetless events with action 5
-- (presumably pushes) and actions 1, 3, 7, 12 on merge requests / issues.
-- NOTE(review): assumes these numeric values match the Rails Event.actions
-- enum — confirm against app code before changing.
where (("events"."action" = 5 AND "events"."target_type" = '')
OR ("events"."action" IN (1, 3, 7, 12)
AND "events"."target_type" IN ('MergeRequest', 'Issue')))
GROUP BY id
# frozen_string_literal: true

module Gitlab
  module ContributionAnalytics
    # Aggregates contribution events from ClickHouse for a group subtree,
    # returning per-author event counts broken down by target type and action.
    class ClickHouseDataCollector
      attr_reader :group, :from, :to

      # @param group [Group] group whose subtree (matched by traversal_ids path prefix) is queried
      # @param from [Date] inclusive lower bound on event created_at
      # @param to [Date] inclusive upper bound on event created_at
      def initialize(group:, from:, to:)
        @group = group
        @from = from
        @to = to
      end

      # Executes the aggregation against the :main ClickHouse database.
      # @return query result rows keyed count_all / events_author_id /
      #   events_target_type / events_action
      def totals_by_author_target_type_action
        ClickHouse::Client.select(clickhouse_query, :main)
      end

      private

      # Path prefix matching the group and all descendant namespaces,
      # e.g. "1/42/" for traversal_ids [1, 42]. The trailing slash prevents
      # prefix "1/4" from also matching "1/42/...".
      def group_path_prefix
        "#{group.traversal_ids.join('/')}/"
      end

      # Builds the aggregation SQL. The inner SELECT collapses duplicate rows
      # per event id to the latest attribute values (argMax over updated_at);
      # the outer SELECT counts events per author/target_type/action.
      #
      # FIX: the original interpolated the path prefix as
      #   startsWith(path, '...#{...}/)  -- missing closing quote,
      # producing an unterminated SQL string literal; the quote is restored here.
      #
      # NOTE(review): values are interpolated directly into the SQL string.
      # traversal_ids are integers and from/to are strftime-formatted dates, so
      # no user-controlled text reaches the query, but parameterized queries
      # would be safer if ClickHouse::Client supports them — worth a follow-up.
      def clickhouse_query
        <<~CLICKHOUSE
          SELECT count(*) as count_all,
          "contribution_analytics_events"."author_id" AS events_author_id,
          "contribution_analytics_events"."target_type" AS events_target_type,
          "contribution_analytics_events"."action" AS events_action
          FROM (
          SELECT
          id,
          argMax(author_id, contribution_analytics_events.updated_at) as author_id,
          argMax(target_type, contribution_analytics_events.updated_at) as target_type,
          argMax(action, contribution_analytics_events.updated_at) as action
          FROM contribution_analytics_events
          WHERE startsWith(path, '#{group_path_prefix}')
          AND "contribution_analytics_events"."created_at" >= '#{from.strftime('%Y-%m-%d')}'
          AND "contribution_analytics_events"."created_at" <= '#{to.strftime('%Y-%m-%d')}'
          GROUP BY id
          ) contribution_analytics_events
          GROUP BY "contribution_analytics_events"."action","contribution_analytics_events"."target_type","contribution_analytics_events"."author_id"
        CLICKHOUSE
      end
    end
  end
end
...@@ -30,7 +30,13 @@ def data_formatter ...@@ -30,7 +30,13 @@ def data_formatter
end end
def db_data_collector def db_data_collector
@data_formatter ||= PostgresqlDataCollector.new(group: group, from: from, to: to) @data_formatter ||= db_collector_klass.new(group: group, from: from, to: to)
end
def db_collector_klass
return ClickHouseDataCollector if Feature.enabled?(:clickhouse_data_collection)
PostgresqlDataCollector
end end
# Format: # Format:
......
...@@ -133,76 +133,96 @@ def create_push_event(author, project) ...@@ -133,76 +133,96 @@ def create_push_event(author, project)
expect(assigns[:data_collector].totals[:total_events].values.sum).to eq(6) expect(assigns[:data_collector].totals[:total_events].values.sum).to eq(6)
end end
it "returns member contributions JSON when format is JSON" do shared_examples 'correct data is returned' do
get :show, params: { group_id: group.path }, format: :json it "returns member contributions JSON when format is JSON" do
get :show, params: { group_id: group.path }, format: :json
expect(json_response.length).to eq(3)
first_user = json_response.at(0)
expect(first_user["username"]).to eq(user.username)
expect(first_user["user_web_url"]).to eq("/#{user.username}")
expect(first_user["fullname"]).to eq(user.name)
expect(first_user["push"]).to eq(1)
expect(first_user["issues_created"]).to eq(0)
expect(first_user["issues_closed"]).to eq(1)
expect(first_user["merge_requests_created"]).to eq(0)
expect(first_user["merge_requests_merged"]).to eq(0)
expect(first_user["total_events"]).to eq(2)
end
it "includes projects in subgroups" do expect(json_response.length).to eq(3)
subgroup = create(:group, parent: group)
subproject = create(:project, :repository, group: subgroup) first_user = json_response.at(0)
expect(first_user["username"]).to eq(user.username)
expect(first_user["user_web_url"]).to eq("/#{user.username}")
expect(first_user["fullname"]).to eq(user.name)
expect(first_user["push"]).to eq(1)
expect(first_user["issues_created"]).to eq(0)
expect(first_user["issues_closed"]).to eq(1)
expect(first_user["merge_requests_created"]).to eq(0)
expect(first_user["merge_requests_merged"]).to eq(0)
expect(first_user["total_events"]).to eq(2)
end
create_event(user, subproject, issue, :closed) it "includes projects in subgroups" do
create_push_event(user, subproject) subgroup = create(:group, parent: group)
subproject = create(:project, :repository, group: subgroup)
get :show, params: { group_id: group.path }, format: :json create_event(user, subproject, issue, :closed)
create_push_event(user, subproject)
first_user = json_response.first get :show, params: { group_id: group.path }, format: :json
expect(first_user["issues_closed"]).to eq(2)
expect(first_user["push"]).to eq(2)
end
it "excludes projects outside of the group" do first_user = json_response.first
empty_group = create(:group) expect(first_user["issues_closed"]).to eq(2)
other_project = create(:project, :repository) expect(first_user["push"]).to eq(2)
end
empty_group.add_reporter(user) it "excludes projects outside of the group" do
empty_group = create(:group)
other_project = create(:project, :repository)
create_event(user, other_project, issue, :closed) empty_group.add_reporter(user)
create_push_event(user, other_project)
get :show, params: { group_id: empty_group.path }, format: :json create_event(user, other_project, issue, :closed)
create_push_event(user, other_project)
expect(json_response).to be_empty get :show, params: { group_id: empty_group.path }, format: :json
expect(json_response).to be_empty
end
end end
it 'does not cause N+1 queries when the format is JSON' do context 'when postgres is the data source' do
control_count = ActiveRecord::QueryRecorder.new do it_behaves_like 'correct data is returned'
get :show, params: { group_id: group.path }, format: :json
it 'does not cause N+1 queries when the format is JSON' do
control_count = ActiveRecord::QueryRecorder.new do
get :show, params: { group_id: group.path }, format: :json
end
controller.instance_variable_set(:@group, nil)
user4 = create(:user)
group.add_member(user4, GroupMember::DEVELOPER)
expect { get :show, params: { group_id: group.path }, format: :json }
.not_to exceed_query_limit(control_count)
end end
controller.instance_variable_set(:@group, nil) describe 'with views' do
user4 = create(:user) render_views
group.add_member(user4, GroupMember::DEVELOPER)
expect { get :show, params: { group_id: group.path }, format: :json } it 'avoids a N+1 query in #show' do
.not_to exceed_query_limit(control_count) # Warm the cache
end get :show, params: { group_id: group.path }
describe 'with views' do control_queries = ActiveRecord::QueryRecorder.new { get :show, params: { group_id: group.path } }
render_views create_push_event(user, project)
it 'avoids a N+1 query in #show' do expect { get :show, params: { group_id: group.path } }.not_to exceed_query_limit(control_queries)
# Warm the cache end
get :show, params: { group_id: group.path } end
end
control_queries = ActiveRecord::QueryRecorder.new { get :show, params: { group_id: group.path } } context 'when clickhouse is the data source', :click_house do
create_push_event(user, project) before do
stub_feature_flags(clickhouse_data_collection: true)
end
expect { get :show, params: { group_id: group.path } }.not_to exceed_query_limit(control_queries) around do |example|
with_net_connect_allowed do
example.run
end
end end
it_behaves_like 'correct data is returned'
end end
describe 'GET #show' do describe 'GET #show' do
......
...@@ -30,13 +30,33 @@ ...@@ -30,13 +30,33 @@
contributions_query_path = 'analytics/contribution_analytics/graphql/contributions.query.graphql' contributions_query_path = 'analytics/contribution_analytics/graphql/contributions.query.graphql'
it "graphql/#{contributions_query_path}.json" do shared_examples 'successful contribution analytics query' do
query = get_graphql_query_as_string(contributions_query_path, ee: true) it "graphql/#{contributions_query_path}.json" do
query = get_graphql_query_as_string(contributions_query_path, ee: true)
post_graphql(query, current_user: user_1, post_graphql(query, current_user: user_1,
variables: { fullPath: group.full_path, startDate: '2023-03-12', endDate: '2023-04-12' }) variables: { fullPath: group.full_path, startDate: '2023-03-12', endDate: '2023-04-12' })
expect_graphql_errors_to_be_empty expect_graphql_errors_to_be_empty
end
end
context 'when postgres is the data source' do
it_behaves_like 'successful contribution analytics query'
end
context 'when clickhouse is the data source', :click_house do
before do
stub_feature_flags(clickhouse_data_collection: true)
end
around do |example|
with_net_connect_allowed do
example.run
end
end
it_behaves_like 'successful contribution analytics query'
end end
end end
......
...@@ -16,77 +16,97 @@ def resolve_contributions(args = {}, context = { current_user: current_user }) ...@@ -16,77 +16,97 @@ def resolve_contributions(args = {}, context = { current_user: current_user })
let_it_be(:user) { create(:user).tap { |u| group.add_developer(user) } } let_it_be(:user) { create(:user).tap { |u| group.add_developer(user) } }
let(:current_user) { user } let(:current_user) { user }
context 'without data' do shared_examples 'contributions resolver' do
it { expect(resolve_contributions(args)).to be_empty } context 'without data' do
end it { expect(resolve_contributions(args)).to be_empty }
end
context 'with data' do context 'with data' do
let_it_be(:another_user) { create(:user).tap { |u| group.add_developer(user) } } let_it_be(:another_user) { create(:user).tap { |u| group.add_developer(user) } }
let_it_be(:event1) do let_it_be(:event1) do
create(:event, :pushed, project: project, author: user, created_at: Date.parse('2022-04-27')) create(:event, :pushed, project: project, author: user, created_at: Date.parse('2022-04-27'))
end end
let_it_be(:event2) do let_it_be(:event2) do
create(:event, :pushed, project: project, author: another_user, created_at: Date.parse('2022-05-01')) create(:event, :pushed, project: project, author: another_user, created_at: Date.parse('2022-05-01'))
end end
let_it_be(:event3) do let_it_be(:event3) do
create(:event, :created, :for_issue, project: project, author: user, created_at: Date.parse('2022-05-05')) create(:event, :created, :for_issue, project: project, author: user, created_at: Date.parse('2022-05-05'))
end end
it 'returns the aggregated event counts' do it 'returns the aggregated event counts' do
contributions = resolve_contributions(args) contributions = resolve_contributions(args)
expect(contributions).to eq([ expect(contributions).to eq([
{ {
user: user, user: user,
issues_closed: 0, issues_closed: 0,
issues_created: 1, issues_created: 1,
merge_requests_approved: 0, merge_requests_approved: 0,
merge_requests_closed: 0, merge_requests_closed: 0,
merge_requests_created: 0, merge_requests_created: 0,
merge_requests_merged: 0, merge_requests_merged: 0,
push: 1, push: 1,
total_events: 2 total_events: 2
}, },
{ {
user: another_user, user: another_user,
issues_closed: 0, issues_closed: 0,
issues_created: 0, issues_created: 0,
merge_requests_approved: 0, merge_requests_approved: 0,
merge_requests_closed: 0, merge_requests_closed: 0,
merge_requests_created: 0, merge_requests_created: 0,
merge_requests_merged: 0, merge_requests_merged: 0,
push: 1, push: 1,
total_events: 1 total_events: 1
} }
]) ])
end end
context 'when the date range is too wide' do context 'when the date range is too wide' do
let(:args) { { from: Date.parse('2021-01-01'), to: Date.parse('2022-05-10') } } let(:args) { { from: Date.parse('2021-01-01'), to: Date.parse('2022-05-10') } }
it 'raises error' do it 'raises error' do
error_message = s_('ContributionAnalytics|The given date range is larger than 93 days') error_message = s_('ContributionAnalytics|The given date range is larger than 93 days')
expect_graphql_error_to_be_created(Gitlab::Graphql::Errors::ArgumentError, error_message) do expect_graphql_error_to_be_created(Gitlab::Graphql::Errors::ArgumentError, error_message) do
resolve_contributions(args) resolve_contributions(args)
end
end end
end end
end
context 'when `to` is earlier than `from`' do context 'when `to` is earlier than `from`' do
let(:args) { { to: Date.parse('2022-04-25'), from: Date.parse('2022-05-10') } } let(:args) { { to: Date.parse('2022-04-25'), from: Date.parse('2022-05-10') } }
it 'raises error' do it 'raises error' do
error_message = s_('ContributionAnalytics|The to date is earlier than the given from date') error_message = s_('ContributionAnalytics|The to date is earlier than the given from date')
expect_graphql_error_to_be_created(Gitlab::Graphql::Errors::ArgumentError, error_message) do expect_graphql_error_to_be_created(Gitlab::Graphql::Errors::ArgumentError, error_message) do
resolve_contributions(args) resolve_contributions(args)
end
end end
end end
end end
end end
context 'when postgres is the data source' do
it_behaves_like 'contributions resolver'
end
context 'when clickhouse is the data source', :click_house do
before do
stub_feature_flags(clickhouse_data_collection: true)
end
around do |example|
with_net_connect_allowed do
example.run
end
end
it_behaves_like 'contributions resolver'
end
end end
end end
...@@ -24,28 +24,48 @@ ...@@ -24,28 +24,48 @@
create(:event, :pushed, project: project1, target: nil, author: user_2) create(:event, :pushed, project: project1, target: nil, author: user_2)
end end
describe '#totals' do shared_examples 'correct collection of data' do
it 'returns formatted data for received events' do describe '#totals' do
data_formatter = described_class.new(data) it 'returns formatted data for received events' do
data_formatter = described_class.new(data)
expect(data_formatter.totals).to eq({
issues_closed: { user_1.id => 1 }, expect(data_formatter.totals).to eq({
issues_created: {}, issues_closed: { user_1.id => 1 },
merge_requests_created: { user_1.id => 1 }, issues_created: {},
merge_requests_merged: {}, merge_requests_created: { user_1.id => 1 },
merge_requests_approved: { user_1.id => 1 }, merge_requests_merged: {},
merge_requests_closed: { user_1.id => 1 }, merge_requests_approved: { user_1.id => 1 },
push: { user_1.id => 1, user_2.id => 1 }, merge_requests_closed: { user_1.id => 1 },
total_events: { user_1.id => 5, user_2.id => 1 } push: { user_1.id => 1, user_2.id => 1 },
}) total_events: { user_1.id => 5, user_2.id => 1 }
})
end
end
end
context 'when postgres is the data source' do
it_behaves_like 'correct collection of data'
describe '#users' do
it 'returns correct users' do
users = described_class.new(data).users
expect(users).to match_array([user_1, user_2])
end
end end
end end
describe '#users' do context 'when clickhouse is the data source', :click_house do
it 'returns correct users' do before do
users = described_class.new(data).users stub_feature_flags(clickhouse_data_collection: true)
end
expect(users).to match_array([user_1, user_2]) around do |example|
with_net_connect_allowed do
example.run
end
end end
it_behaves_like 'correct collection of data'
end end
end end
...@@ -11,19 +11,19 @@ ...@@ -11,19 +11,19 @@
let(:query) do let(:query) do
<<~QUERY <<~QUERY
query($fullPath: ID!) { query($fullPath: ID!) {
group(fullPath: $fullPath) { group(fullPath: $fullPath) {
contributions(from: "2022-01-01", to: "2022-01-10") { contributions(from: "2022-01-01", to: "2022-01-10") {
nodes { nodes {
user { user {
id id
}
totalEvents
repoPushed
} }
totalEvents
repoPushed
} }
} }
} }
}
QUERY QUERY
end end
...@@ -43,30 +43,50 @@ ...@@ -43,30 +43,50 @@
create(:event, :pushed, project: project, author: user, created_at: Date.parse('2022-01-05')) create(:event, :pushed, project: project, author: user, created_at: Date.parse('2022-01-05'))
end end
it 'returns data' do shared_examples 'returns correct data' do
post_graphql(query, current_user: user, variables: { fullPath: group.full_path }) it 'returns data' do
post_graphql(query, current_user: user, variables: { fullPath: group.full_path })
expect(graphql_data_at('group', 'contributions', 'nodes')).to eq([ expect(graphql_data_at('group', 'contributions', 'nodes')).to eq([
{ 'user' => { 'id' => user.to_gid.to_s }, { 'user' => { 'id' => user.to_gid.to_s },
'totalEvents' => 1, 'totalEvents' => 1,
'repoPushed' => 1 } 'repoPushed' => 1 }
]) ])
end
end end
context 'with events from different users' do context 'when postgres is the data source' do
def run_query it_behaves_like 'returns correct data'
post_graphql(query, current_user: user, variables: { fullPath: group.full_path })
end context 'with events from different users' do
def run_query
post_graphql(query, current_user: user, variables: { fullPath: group.full_path })
end
it 'does not create N+1 queries' do it 'does not create N+1 queries' do
# warm the query to avoid flakiness # warm the query to avoid flakiness
run_query run_query
control_count = ActiveRecord::QueryRecorder.new { run_query } control_count = ActiveRecord::QueryRecorder.new { run_query }
create(:event, :pushed, project: project, author: create(:user), created_at: Date.parse('2022-01-05')) create(:event, :pushed, project: project, author: create(:user), created_at: Date.parse('2022-01-05'))
expect { run_query }.not_to exceed_all_query_limit(control_count) expect { run_query }.not_to exceed_all_query_limit(control_count)
end
end end
end end
context 'when clickhouse is the data source', :click_house do
before do
stub_feature_flags(clickhouse_data_collection: true)
end
around do |example|
with_net_connect_allowed do
example.run
end
end
it_behaves_like 'returns correct data'
end
end end
end end
...@@ -110,96 +110,4 @@ def format_row(event) ...@@ -110,96 +110,4 @@ def format_row(event)
end end
end end
end end
describe 'querying data', :click_house do
around do |example|
with_net_connect_allowed do
example.run
end
end
it 'returns data from the DB' do
result = Gitlab::ClickHouse::Client.execute("SELECT 1 AS value", :main)
expect(result).to eq([{ 'value' => 1 }])
end
end
describe 'inserting', :click_house do
let_it_be(:group) { create(:group) }
let_it_be(:project) { create(:project) }
let_it_be(:author1) { create(:user).tap { |u| project.add_developer(u) } }
let_it_be(:author2) { create(:user).tap { |u| project.add_developer(u) } }
let_it_be(:issue1) { create(:issue, project: project) }
let_it_be(:issue2) { create(:issue, project: project) }
let_it_be(:merge_request) { create(:merge_request, source_project: project) }
let_it_be(:event1) { create(:event, :created, target: issue1, author: author1) }
let_it_be(:event2) { create(:event, :closed, target: issue2, author: author2) }
let_it_be(:event3) { create(:event, :merged, target: merge_request, author: author1) }
let(:events) { [event1, event2, event3] }
def format_row(event)
path = event.project.reload.project_namespace.traversal_ids.join('/')
action = Event.actions[event.action]
[
event.id,
"'#{path}/'",
event.author_id,
event.target_id,
"'#{event.target_type}'",
action,
event.created_at.to_f,
event.updated_at.to_f
].join(',')
end
describe 'RSpec hooks' do
it 'ensures that tables are empty' do
results = ClickHouse::Client.select('SELECT * FROM events', :main)
expect(results).to be_empty
end
end
it 'inserts and modifies data' do
insert_query = <<~SQL
INSERT INTO events
(id, path, author_id, target_id, target_type, action, created_at, updated_at)
VALUES
(#{format_row(event1)}),
(#{format_row(event2)}),
(#{format_row(event3)})
SQL
ClickHouse::Client.execute(insert_query, :main)
results = ClickHouse::Client.select('SELECT * FROM events ORDER BY id', :main)
expect(results.size).to eq(3)
last = results.last
expect(last).to match(a_hash_including(
'id' => event3.id,
'author_id' => event3.author_id,
'created_at' => be_within(0.05).of(event3.created_at),
'target_type' => event3.target_type
))
ClickHouse::Client.execute("DELETE FROM events WHERE id = #{event3.id}", :main)
results = ClickHouse::Client.select("SELECT * FROM events WHERE id = #{event3.id}", :main)
expect(results).to be_empty
end
end
describe 'querying data', :click_house do
it 'returns data from the DB' do
result = ClickHouse::Client.select('SELECT 1 AS value', :main)
expect(result).to eq([{ 'value' => 1 }])
end
end
end end
...@@ -345,6 +345,9 @@ ...@@ -345,6 +345,9 @@
# Keep-around refs should only be turned off for specific projects/repositories. # Keep-around refs should only be turned off for specific projects/repositories.
stub_feature_flags(disable_keep_around_refs: false) stub_feature_flags(disable_keep_around_refs: false)
# Postgres is the primary data source, and ClickHouse only when enabled in certain cases.
stub_feature_flags(clickhouse_data_collection: false)
allow(Gitlab::GitalyClient).to receive(:can_use_disk?).and_return(enable_rugged) allow(Gitlab::GitalyClient).to receive(:can_use_disk?).and_return(enable_rugged)
else else
unstub_all_feature_flags unstub_all_feature_flags
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment