Skip to content

Migration for audit events table in ClickHouse

Harsimar Sandhu requested to merge 431681-create-audit-events-clickhouse into master

What does this MR do and why?

Migration for audit events table in ClickHouse

This commit adds migration for creating audit events table in ClickHouse database

Schema comparisons

In PostgreSQL below is audit_events schema:

CREATE TABLE audit_events (
  author_id integer NOT NULL, 
  author_name text, 
  created_at timestamp without time zone NOT NULL, 
  audit_events_pkey PRIMARY KEY (created_at), 
  details text, 
  entity_id integer NOT NULL, 
  entity_path text, 
  entity_type character varying NOT NULL, 
  id bigint NOT NULL, 
  audit_events_pkey PRIMARY KEY (id), 
  ip_address inet, 
  target_details text, 
  target_id bigint, 
  target_type text
);

I tried two different schemas for our table: one uses a composite primary key and the other uses projections. I found that the schema with the composite primary key is faster in giving back query results and uses less space on our disk.

Schema 1

CREATE TABLE audit_events
(
    id Int64 DEFAULT 0,
    author_id Int32 DEFAULT 0,
    author_name String DEFAULT '',
    created_at DateTime64(6, 'UTC') DEFAULT now(),
    details String DEFAULT '',
    entity_id Int32 DEFAULT 0,
    entity_path String DEFAULT '',
    entity_type String DEFAULT '',
    ip_address IPv4 DEFAULT '',
    target_details String DEFAULT '',
    target_id Int64 DEFAULT 0,
    target_type String DEFAULT '',
) ENGINE = ReplacingMergeTree()
PARTITION BY toYYYYMM(created_at)
PRIMARY KEY (entity_type, entity_id, author_id, id)
ORDER BY (entity_type, entity_id, author_id, id);

Query 1

SELECT *
FROM audit_events
WHERE entity_type = 'Group' AND entity_id = 84
   AND author_id = 23
   AND created_at >= '2023-01-01' AND created_at < '2023-02-01';

Elapsed: 0.006 sec, read 8.19 thousand rows, 277.25 KB.

Query 2

SELECT *
FROM audit_events
WHERE entity_type = 'Group' AND entity_id = 84
   AND created_at >= '2023-01-01' AND created_at < '2023-02-01';

Elapsed: 0.012 sec, read 16.38 thousand rows, 2.68 MB.

Query 3

SELECT *
FROM audit_events
WHERE entity_type = 'Group' AND entity_id = 2184
   AND created_at >= '2023-01-01' AND created_at < '2023-02-01';

Elapsed: 0.009 sec, read 8.19 thousand rows, 149.80 KB.

SELECT
    formatReadableSize(sum(bytes_on_disk))
FROM system.parts;

3.23 GiB

Schema 2

CREATE TABLE audit_events
(
    id Int64 DEFAULT 0,
    author_id Int32 DEFAULT 0,
    author_name String DEFAULT '',
    created_at DateTime64(6, 'UTC') DEFAULT now(),
    details String DEFAULT '',
    entity_id Int32 DEFAULT 0,
    entity_path String DEFAULT '',
    entity_type String DEFAULT '',
    ip_address IPv4 DEFAULT '',
    target_details String DEFAULT '',
    target_id Int64 DEFAULT 0,
    target_type String DEFAULT '',
) ENGINE = ReplacingMergeTree()
PARTITION BY toYYYYMM(created_at)
PRIMARY KEY id
ORDER BY id;

ALTER TABLE audit_events ADD PROJECTION author_id_projection (
    SELECT *
    ORDER BY author_id
);

ALTER TABLE audit_events ADD PROJECTION entity_projection (
    SELECT *
    ORDER BY entity_id, entity_type
);

ALTER TABLE audit_events MATERIALIZE PROJECTION author_id_projection;
ALTER TABLE audit_events MATERIALIZE PROJECTION entity_projection;

Query 1

SELECT *
FROM audit_events
WHERE entity_type = 'Group' AND entity_id = 84
   AND author_id = 23
   AND created_at >= '2023-01-01' AND created_at < '2023-02-01';

Elapsed: 0.015 sec, read 16.38 thousand rows, 1.38 MB.

Query 2

SELECT *
FROM audit_events
WHERE entity_type = 'Group' AND entity_id = 84
   AND created_at >= '2023-01-01' AND created_at < '2023-02-01';

Elapsed: 0.231 sec, read 16.38 thousand rows, 2.76 MB.

Query 3

SELECT *
FROM audit_events
WHERE entity_type = 'Group' AND entity_id = 2184
   AND created_at >= '2023-01-01' AND created_at < '2023-02-01';

Elapsed: 0.050 sec, read 8.19 thousand rows, 446.11 KB.

Disk usage

SELECT
    formatReadableSize(sum(bytes_on_disk))
FROM system.parts;

9.68 GiB

I performed the above analysis using the below queries where I created 30 million records in the database.



INSERT INTO audit_events (author_id, author_name, created_at, details, entity_id, entity_path, entity_type, id, ip_address, target_details, target_id, target_type)
SELECT
    (rand() % 100000) + 1 AS author_id,
    concat('Author_', toString((rand() % 100000) + 1)) AS author_name,
    now() - interval (rand() % 31536000) SECOND AS created_at,
    concat('Details_', toString((rand() % 1000000) + 1)) AS details,
    (rand() % 100000) + 1 AS entity_id,
    concat('/entity/', toString((rand() % 100000) + 1)) AS entity_path,
    if(rand() % 2 = 0, 'Project', 'Group') AS entity_type,
    number AS id,
    '127.0.0.1' AS ip_address,
    concat('TargetDetails_', toString((rand() % 1000000) + 1)) AS target_details,
    (rand() % 100000) + 1 AS target_id,
    concat('TargetType_', toString((rand() % 10) + 1)) AS target_type
FROM numbers(10000000);

INSERT INTO audit_events (author_id, author_name, created_at, details, entity_id, entity_path, entity_type, id, ip_address, target_details, target_id, target_type)
SELECT
    (rand() % 100000) + 1 AS author_id,
    concat('Author_', toString((rand() % 100000) + 1)) AS author_name,
    now() - interval (rand() % 31536000) SECOND AS created_at,
    concat('Details_', toString((rand() % 1000000) + 1)) AS details,
    84 AS entity_id,
    concat('/entity/', toString((rand() % 100000) + 1)) AS entity_path,
    'Group' AS entity_type,
    number + 10000000 AS id,
    '127.0.0.1' AS ip_address,
    concat('TargetDetails_', toString((rand() % 1000000) + 1)) AS target_details,
    (rand() % 100000) + 1 AS target_id,
    concat('TargetType_', toString((rand() % 10) + 1)) AS target_type
FROM numbers(10000000);


INSERT INTO audit_events (author_id, author_name, created_at, details, entity_id, entity_path, entity_type, id, ip_address, target_details, target_id, target_type)
SELECT
    (rand() % 100000) + 1 AS author_id,
    concat('Author_', toString((rand() % 100000) + 1)) AS author_name,
    now() - interval (rand() % 31536000) SECOND AS created_at,
    concat('Details_', toString((rand() % 1000000) + 1)) AS details,
    184 AS entity_id,
    concat('/entity/', toString((rand() % 100000) + 1)) AS entity_path,
    'Group' AS entity_type,
    number + 20000000 AS id,
    '127.0.0.1' AS ip_address,
    concat('TargetDetails_', toString((rand() % 1000000) + 1)) AS target_details,
    (rand() % 100000) + 1 AS target_id,
    concat('TargetType_', toString((rand() % 10) + 1)) AS target_type
FROM numbers(10000000);

MR acceptance checklist

This checklist encourages us to confirm any changes have been analyzed to reduce risks in quality, performance, reliability, security, and maintainability.

Related to #431681 (closed)

Edited by Harsimar Sandhu

Merge request reports