Verified Commit 22dcd456 authored by Jean-Gabriel Doyon PTO until 2024-04-17's avatar Jean-Gabriel Doyon PTO until 2024-04-17 Committed by GitLab
Browse files

feat(testing): add datalake generator test tool

parent b5b40832
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -18,6 +18,7 @@ gl_synthetic_data/
gkg_simulator_manifest.json
crates/simulator/profiles
evaluation_results/
datalake-generator-state/

# Tilt secrets (contains passwords)
.tilt-secrets
+169 −43
Original line number Diff line number Diff line
@@ -113,7 +113,7 @@ version = "1.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc"
dependencies = [
 "windows-sys 0.60.2",
 "windows-sys 0.61.2",
]

[[package]]
@@ -124,7 +124,7 @@ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d"
dependencies = [
 "anstyle",
 "once_cell_polyfill",
 "windows-sys 0.60.2",
 "windows-sys 0.61.2",
]

[[package]]
@@ -674,9 +674,9 @@ checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7"

[[package]]
name = "bitflags"
version = "2.10.0"
version = "2.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3"
checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af"

[[package]]
name = "block-buffer"
@@ -852,9 +852,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"

[[package]]
name = "cc"
version = "1.2.55"
version = "1.2.56"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "47b26a0954ae34af09b50f0de26458fa95369a0d478d8236d3f93082b219bd29"
checksum = "aebf35691d1bfb0ac386a69bac2fde4dd276fb618cf8bf4f5318fe285e821bb2"
dependencies = [
 "find-msvc-tools",
 "jobserver",
@@ -1179,6 +1179,15 @@ dependencies = [
 "libc",
]

[[package]]
name = "crc32fast"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511"
dependencies = [
 "cfg-if",
]

[[package]]
name = "criterion"
version = "0.8.2"
@@ -1983,6 +1992,29 @@ dependencies = [
 "sqlparser",
]

[[package]]
name = "datalake-generator"
version = "0.1.0"
dependencies = [
 "anyhow",
 "arrow",
 "chrono",
 "clap",
 "clickhouse-client",
 "flate2",
 "ontology",
 "opentelemetry",
 "rand 0.8.5",
 "serde",
 "serde_json",
 "serde_yaml",
 "sysinfo",
 "tokio",
 "tokio-util",
 "tracing",
 "tracing-subscriber",
]

[[package]]
name = "der"
version = "0.7.10"
@@ -1996,9 +2028,9 @@ dependencies = [

[[package]]
name = "deranged"
version = "0.5.5"
version = "0.5.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ececcb659e7ba858fb4f10388c250a7252eb0a27373f1a72b8748afdd248e587"
checksum = "cc3dc5ad92c2e2d1c193bbbbdf2ea477cb81331de4f3103f267ca18368b988c4"
dependencies = [
 "powerfmt",
 "serde_core",
@@ -2376,6 +2408,7 @@ version = "1.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c"
dependencies = [
 "crc32fast",
 "miniz_oxide",
 "zlib-rs",
]
@@ -3275,9 +3308,9 @@ dependencies = [

[[package]]
name = "indicatif"
version = "0.18.3"
version = "0.18.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9375e112e4b463ec1b1c6c011953545c65a30164fbab5b581df32b3abf0dcb88"
checksum = "25470f23803092da7d239834776d653104d551bc4d7eacaf31e6837854b8e9eb"
dependencies = [
 "console",
 "portable-atomic",
@@ -3364,9 +3397,9 @@ checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2"

[[package]]
name = "jiff"
version = "0.2.19"
version = "0.2.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d89a5b5e10d5a9ad6e5d1f4bd58225f655d6fe9767575a5e8ac5a6fe64e04495"
checksum = "c867c356cc096b33f4981825ab281ecba3db0acefe60329f044c1789d94c6543"
dependencies = [
 "jiff-static",
 "log",
@@ -3377,9 +3410,9 @@ dependencies = [

[[package]]
name = "jiff-static"
version = "0.2.19"
version = "0.2.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff7a39c8862fc1369215ccf0a8f12dd4598c7f6484704359f0351bd617034dbf"
checksum = "f7946b4325269738f270bb55b3c19ab5c5040525f83fd625259422a9d25d9be5"
dependencies = [
 "proc-macro2",
 "quote",
@@ -3721,9 +3754,9 @@ dependencies = [

[[package]]
name = "libc"
version = "0.2.181"
version = "0.2.182"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "459427e2af2b9c839b132acb702a1c654d95e10f8c326bfc2ad11310e458b1c5"
checksum = "6800badb6cb2082ffd7b6a67e6125bb39f18782f793520caee8cb8846be06112"

[[package]]
name = "libloading"
@@ -3759,7 +3792,7 @@ checksum = "3d0b95e02c851351f877147b7deea7b1afb1df71b63aa5f8270716e0c5720616"
dependencies = [
 "bitflags",
 "libc",
 "redox_syscall 0.7.0",
 "redox_syscall 0.7.1",
]

[[package]]
@@ -3932,6 +3965,15 @@ dependencies = [
 "minimal-lexical",
]

[[package]]
name = "ntapi"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3b335231dfd352ffb0f8017f3b6027a4917f7df785ea2143d8af2adc66980ae"
dependencies = [
 "winapi",
]

[[package]]
name = "nu-ansi-term"
version = "0.50.3"
@@ -4063,6 +4105,25 @@ dependencies = [
 "libc",
]

[[package]]
name = "objc2-core-foundation"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1c10c2894a6fed806ade6027bcd50662746363a9589d3ec9d9bef30a4e4bc166"
dependencies = [
 "bitflags",
]

[[package]]
name = "objc2-io-kit"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "71c1c64d6120e51cd86033f67176b1cb66780c2efe34dec55176f77befd93c0a"
dependencies = [
 "libc",
 "objc2-core-foundation",
]

[[package]]
name = "object"
version = "0.37.3"
@@ -4773,7 +4834,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7"
dependencies = [
 "heck",
 "itertools 0.13.0",
 "itertools 0.14.0",
 "log",
 "multimap",
 "petgraph",
@@ -4794,7 +4855,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b"
dependencies = [
 "anyhow",
 "itertools 0.13.0",
 "itertools 0.14.0",
 "proc-macro2",
 "quote",
 "syn",
@@ -5101,9 +5162,9 @@ dependencies = [

[[package]]
name = "redox_syscall"
version = "0.7.0"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49f3fe0889e69e2ae9e41f4d6c4c0181701d00e4697b356fb1f74173a5e0ee27"
checksum = "35985aa610addc02e24fc232012c86fd11f14111180f902b67e2d5331f8ebf2b"
dependencies = [
 "bitflags",
]
@@ -5441,7 +5502,7 @@ dependencies = [
 "openssl-probe 0.2.1",
 "rustls-pki-types",
 "schannel",
 "security-framework 3.5.1",
 "security-framework 3.6.0",
]

[[package]]
@@ -5478,7 +5539,7 @@ dependencies = [
 "rustls-native-certs 0.8.3",
 "rustls-platform-verifier-android",
 "rustls-webpki 0.103.9",
 "security-framework 3.5.1",
 "security-framework 3.6.0",
 "security-framework-sys",
 "webpki-root-certs",
 "windows-sys 0.61.2",
@@ -5637,9 +5698,9 @@ dependencies = [

[[package]]
name = "security-framework"
version = "3.5.1"
version = "3.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b3297343eaf830f66ede390ea39da1d462b6b0c1b000f420d0a83f898bbbe6ef"
checksum = "d17b898a6d6948c3a8ee4372c17cb384f90d2e6e912ef00895b14fd7ab54ec38"
dependencies = [
 "bitflags",
 "core-foundation 0.10.1",
@@ -5650,9 +5711,9 @@ dependencies = [

[[package]]
name = "security-framework-sys"
version = "2.15.0"
version = "2.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cc1f0cbffaac4852523ce30d8bd3c5cdc873501d96ff467ca09b6767bb8cd5c0"
checksum = "321c8673b092a9a42605034a9879d73cb79101ed5fd117bc9a597b89b4e9e61a"
dependencies = [
 "core-foundation-sys",
 "libc",
@@ -5938,9 +5999,9 @@ checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e"

[[package]]
name = "simple_asn1"
version = "0.6.3"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "297f631f50729c8c99b84667867963997ec0b50f32b2a7dbcab828ef0541e8bb"
checksum = "0d585997b0ac10be3c5ee635f1bab02d512760d14b7c468801ac8a01d9ae5f1d"
dependencies = [
 "num-bigint",
 "num-traits",
@@ -6348,9 +6409,9 @@ dependencies = [

[[package]]
name = "syn"
version = "2.0.114"
version = "2.0.116"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d4d107df263a3013ef9b1879b0df87d706ff80f65a86ea879bd9c31f9b307c2a"
checksum = "3df424c70518695237746f84cede799c9c58fcb37450d7b23716568cc8bc69cb"
dependencies = [
 "proc-macro2",
 "quote",
@@ -6377,6 +6438,20 @@ dependencies = [
 "syn",
]

[[package]]
name = "sysinfo"
version = "0.38.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1efc19935b4b66baa6f654ac7924c192f55b175c00a7ab72410fc24284dacda8"
dependencies = [
 "libc",
 "memchr",
 "ntapi",
 "objc2-core-foundation",
 "objc2-io-kit",
 "windows",
]

[[package]]
name = "tabled"
version = "0.20.0"
@@ -6729,9 +6804,9 @@ dependencies = [

[[package]]
name = "toml_parser"
version = "1.0.7+spec-1.1.0"
version = "1.0.9+spec-1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "247eaa3197818b831697600aadf81514e577e0cba5eab10f7e064e78ae154df1"
checksum = "702d4415e08923e7e1ef96cd5727c0dfed80b4d2fa25db9647fe5eb6f7c5a4c4"
dependencies = [
 "winnow",
]
@@ -6796,9 +6871,9 @@ dependencies = [

[[package]]
name = "tonic-prost-build"
version = "0.14.3"
version = "0.14.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a4556786613791cfef4ed134aa670b61a85cfcacf71543ef33e8d801abae988f"
checksum = "65873ace111e90344b8973e94a1fc817c924473affff24629281f90daed1cd2e"
dependencies = [
 "prettyplease",
 "proc-macro2",
@@ -6812,9 +6887,9 @@ dependencies = [

[[package]]
name = "toon-format"
version = "0.4.1"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "349a10106f2c703fbfbe4a2eef6683af0acca60a0b8334c7a33795231dbf92d5"
checksum = "ee5b663187f1bbcad8232ce10e0ba75ba95b169a6126938768649239e7048a52"
dependencies = [
 "indexmap 2.13.0",
 "serde",
@@ -7150,9 +7225,9 @@ checksum = "81b79ad29b5e19de4260020f8919b443b2ef0277d242ce532ec7b7a2cc8b6007"

[[package]]
name = "unicode-ident"
version = "1.0.23"
version = "1.0.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "537dd038a89878be9b64dd4bd1b260315c1bb94f4d784956b81e27a088d9a09e"
checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"

[[package]]
name = "unicode-segmentation"
@@ -7507,6 +7582,27 @@ version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"

[[package]]
name = "windows"
version = "0.62.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "527fadee13e0c05939a6a05d5bd6eec6cd2e3dbd648b9f8e447c6518133d8580"
dependencies = [
 "windows-collections",
 "windows-core",
 "windows-future",
 "windows-numerics",
]

[[package]]
name = "windows-collections"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "23b2d95af1a8a14a3c7367e1ed4fc9c20e0a26e79551b1454d72583c97cc6610"
dependencies = [
 "windows-core",
]

[[package]]
name = "windows-core"
version = "0.62.2"
@@ -7520,6 +7616,17 @@ dependencies = [
 "windows-strings",
]

[[package]]
name = "windows-future"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e1d6f90251fe18a279739e78025bd6ddc52a7e22f921070ccdc67dde84c605cb"
dependencies = [
 "windows-core",
 "windows-link",
 "windows-threading",
]

[[package]]
name = "windows-implement"
version = "0.60.2"
@@ -7548,6 +7655,16 @@ version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"

[[package]]
name = "windows-numerics"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e2e40844ac143cdb44aead537bbf727de9b044e107a0f1220392177d15b0f26"
dependencies = [
 "windows-core",
 "windows-link",
]

[[package]]
name = "windows-result"
version = "0.4.1"
@@ -7659,6 +7776,15 @@ dependencies = [
 "windows_x86_64_msvc 0.53.1",
]

[[package]]
name = "windows-threading"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3949bd5b99cafdf1c7ca86b43ca564028dfe27d66958f2470940f73d86d75b37"
dependencies = [
 "windows-link",
]

[[package]]
name = "windows_aarch64_gnullvm"
version = "0.42.2"
@@ -8021,15 +8147,15 @@ dependencies = [

[[package]]
name = "zlib-rs"
version = "0.6.0"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7948af682ccbc3342b6e9420e8c51c1fe5d7bf7756002b4a3c6cabfe96a7e3c"
checksum = "3a33bbf307b25a1774cee0687694ec72fa7814b3ab5c1c12a9d2fc6a36fc439c"

[[package]]
name = "zmij"
version = "1.0.20"
version = "1.0.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4de98dfa5d5b7fef4ee834d0073d560c9ca7b6c46a71d058c48db7960f8cfaf7"
checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"

[[package]]
name = "zstd"
+1 −0
Original line number Diff line number Diff line
@@ -18,6 +18,7 @@ members = [
  "crates/simulator",
  "crates/health-check",
  "crates/siphon-proto",
  "crates/datalake-generator",
]

[workspace.dependencies]
+29 −0
Original line number Diff line number Diff line
# Manifest for the datalake generator, a synthetic-data seeding tool for
# development and testing (not for production use).
[package]
name = "datalake-generator"
version = "0.1.0"
edition = "2024"
license = "LicenseRef-EE"

# Command-line entry point; run with `cargo run --bin datalake-generate`.
[[bin]]
name = "datalake-generate"
path = "src/bin/generate.rs"

# Entries marked `workspace = true` take their version from the workspace
# manifest; `path` entries are sibling crates in this repository.
[dependencies]
anyhow = { workspace = true }
arrow = { workspace = true }
chrono = { workspace = true }
clap = { workspace = true }
clickhouse-client = { path = "../clickhouse-client" }
flate2 = "1"
ontology = { path = "../ontology" }
opentelemetry = { workspace = true }
# `small_rng` turns on `SmallRng`, the seeded RNG used for generation.
rand = { version = "0.8", features = ["small_rng"] }
serde = { workspace = true }
serde_json = { workspace = true }
serde_yaml = { workspace = true }
sysinfo = "0.38.1"
tokio = { workspace = true }
tokio-util = { workspace = true }
tracing = { workspace = true }
tracing-subscriber = { workspace = true }
+284 −0
Original line number Diff line number Diff line
# Datalake Generator

> **Not for production use.** This tool generates synthetic data for development and testing only.

High-throughput seeding harness for ClickHouse tables used by the Knowledge Graph.

The generator has a single seeding architecture with:
- deterministic foundation generation (users, groups, projects),
- staged table writes with dependency ordering,
- optional continuous mode for ongoing insert/update/delete traffic.

## Quick start

```bash
cargo run --bin datalake-generate -- -c crates/datalake-generator/datalake-generator.yaml
```

## CLI

```bash
cargo run --bin datalake-generate -- [OPTIONS]
```

Options:
- `-c, --config <PATH>`: YAML config path (default `datalake-generator.yaml`)
- `--skip-seeding`: skip the initial seed and run from saved state

## Main flow

1. Build foundation entities from config.
2. Truncate stage tables that exist in the target ClickHouse schema.
3. Run staged writes in dependency order.
4. Persist state for continuous mode.
5. Optionally run continuous mode.
6. Write metrics report.

## Data generation pipeline

```
 OS threads (std::thread::scope)
┌──────────────────────────────────────────┐
│                                          │
│  ┌──────────┐ ┌──────────┐ ┌──────────┐  │
│  │ Table A  │ │ Table B  │ │ Table C  │  │
│  │ producer │ │ producer │ │ producer │  │
│  └────┬─────┘ └────┬─────┘ └────┬─────┘  │
│       │            │            │        │
└───────┼────────────┼────────────┼────────┘
        │            │            │
        ▼            ▼            ▼
  ┌────────────────────────────────────────┐
  │   sync_channel (bounded, capacity 16)  │
  │        Arrow RecordBatches             │
  └──────────────────┬─────────────────────┘
                     │
                     ▼
          ┌───────────────────────┐
          │  Consumer thread      │
          │  (spawn_blocking)     │
          │                       │
          │  tokio::spawn per     │
          │  batch -> ClickHouse  │
          │  HTTP insert          │
          └───────────────────────┘
```

Each stage (foundation, primary, secondary, leaf) runs this pipeline for its
tables. Within a stage, all tables generate rows in parallel.

## Graph structure

The generator builds a property graph that mirrors a GitLab instance's SDLC
data. The graph has three layers, each depending on the one above.

### Foundation layer

Built first, deterministically from config. These entities form the skeleton
that all project-scoped data hangs off of.

```
Organization (implicit, always id=1)
├── Users (flat list)
└── Groups (hierarchical)
    ├── Root Group 1
    │   ├── Subgroup 1a
    │   │   └── Subgroup 1a-i
    │   └── Subgroup 1b
    ├── Root Group 2
    │   └── ...
    └── Projects (under each group)
```

Each group gets an entity ID, a namespace ID, and a traversal path
(`"1/2/3/"`) encoding its position in the hierarchy. Projects inherit
their parent group's namespace and path.

### Project-scoped entities

Generated per-project according to `per_project` counts in the config.
Every entity gets a synthetic ID, a `project_id`, a `namespace_id`,
and a `traversal_path` linking it back to its project.

Entities are written in four dependency-ordered stages:

| Stage | Tables | Depends on |
|-------|--------|------------|
| Foundation | `siphon_users`, `siphon_namespaces`, `siphon_namespace_details`, `namespace_traversal_paths`, `siphon_projects`, `project_namespace_traversal_paths`, `siphon_knowledge_graph_enabled_namespaces` | nothing |
| Primary | `hierarchy_merge_requests`, `hierarchy_work_items`, `siphon_issues`, `siphon_p_ci_pipelines`, `siphon_vulnerabilities`, `siphon_vulnerability_scanners`, `siphon_vulnerability_identifiers`, `siphon_vulnerability_occurrences`, `siphon_milestones`, `siphon_labels`, `siphon_members` | Foundation |
| Secondary | `siphon_notes`, `siphon_merge_request_diffs`, `siphon_p_ci_stages`, `siphon_security_scans`, `siphon_vulnerability_merge_request_links`, `siphon_merge_requests_closing_issues`, `siphon_work_item_parent_links`, `siphon_issue_links`, `siphon_vulnerability_occurrence_identifiers` | Primary |
| Leaf | `siphon_p_ci_builds`, `siphon_security_findings`, `siphon_merge_request_diff_files` | Secondary |

Within a stage, all tables generate in parallel. The next stage starts
only after the previous stage finishes.

### Relationships

Parent-child and cross-entity relationships are wired deterministically
using `map_child_to_parent_index`, which spreads children evenly across
parents: `parent_index = (child_index * parent_count) / child_count`.

Relationships expressed:

```
MergeRequest ──────── Project (target_project_id, source_project_id)
MergeRequest ──────── User (author_id)
WorkItem ──────────── User (author_id)
Note ─────────────┬── MergeRequest (noteable_id, split by ratio)
                  └── WorkItem (noteable_id)
MergeRequestDiff ──── MergeRequest (merge_request_id)
MergeRequestDiffFile ─ MergeRequestDiff (merge_request_diff_id)
Stage ─────────────── Pipeline (pipeline_id)
Job ───────────────── Stage (stage_id)
SecurityScan ──────── Pipeline + Job (pipeline_id, build_id)
SecurityFinding ───── SecurityScan + VulnerabilityScanner (scan_id, scanner_id)
Vulnerability ─────── User (author_id)
VulnerabilityOccurrence ─ Vulnerability + Scanner + Identifier
VulnerabilityMergeRequestLink ─ Vulnerability + MergeRequest
MergeRequestClosingIssue ──── MergeRequest + WorkItem
WorkItemParentLink ── WorkItem (parent) + WorkItem (child)
IssueLink ─────────── WorkItem (source) + WorkItem (target)
Member ────────────── Project + User (source_id, user_id)
```

Notes are split between MergeRequest and WorkItem parents proportionally
to their respective counts. Work item parent links form a flat hierarchy
(all children point to the first work item in the project).

## Synthetic ID scheme

IDs are computed arithmetically so that every row across every table gets a
globally unique, deterministic ID with no coordination or sequence counters.

```
table_id_base = next_entity_id + (table_position * block_size)
block_size    = project_count * max_rows_per_project + 1

row_id = table_id_base + (project_index * rows_per_project) + entity_index
```

`table_position` is a fixed ordinal for each project-scoped table (0 for
merge_requests, 1 for work_items, etc.), defined in `catalog.rs`. This
spreads each table's ID range into a non-overlapping block. Child tables
compute parent IDs using the parent table's base and the same formula,
so referential links are computed without lookups.

## Fake value generation

The generator auto-fills any column not explicitly set by a relationship
writer. Column names are classified by pattern matching into kinds, and
each kind produces plausible-looking synthetic data:

| Column pattern | Kind | Example output |
|---------------|------|----------------|
| `id`, `*_id` | Id | `42317` |
| `iid` | Iid | `1` through `10000` |
| `*email*` | Email | `user12ab@example.com` |
| `*url*` | Url | `https://example.com/a1b2/c3d4` |
| `*sha*`, `*hash*`, `*fingerprint*` | Sha | 40-char hex string |
| `*path*` | Path | `/p1a/d2b/c3d4e5f6` |
| `*name*`, `*title*` | Name | `alpha_a1b2c3d4` |
| `*description*`, `*body*`, `*note*` | Description | `Lorem ipsum dolor a1b2` |
| `*status*` | Status | `open`, `closed`, `merged`, `pending`, `active` |
| `*state*` | State | `pending`, `running`, `success`, `failed`, `canceled` |
| `*ref*`, `*branch*` | Branch | `feature/branch-a1b2` |
| `uuid`, `*_uuid` | Uuid | RFC 4122 format |
| `*_ids` | IdList | `[1, 2345, 6789]` |
| `*_at`, `created_at`, `updated_at` | DateTime | timestamp within last 5 years |
| anything else (string) | GenericString | `val<hex>` |

Nullable columns have a ~10% chance of being null.

### Field overrides

The `field_overrides` config section constrains generated values for
enum-like columns to valid domain values. Without overrides, status/state
columns pick from a small hardcoded pool. With overrides, the generator
picks uniformly from the provided list:

```yaml
field_overrides:
  MergeRequest:
    state_id: [1, 2, 3, 4]
    merge_status: ["unchecked", "can_be_merged", "cannot_be_merged"]
```

### Determinism

All generation is seeded. The base seed (default `42`) is XORed with a
per-table offset so each table gets a different random sequence while
remaining fully reproducible. The RNG is `SmallRng` seeded via
`seed_from_u64`, and the counter is mixed with the 64-bit golden-ratio
constant (`0x9e3779b97f4a7c15`) for better bit distribution.

## Limitations

**Single organization.** The generator always uses `organization_id = 1`.
Multi-org generation is not supported.

**No schema DDL.** The generator reads schemas from a running ClickHouse
instance. Tables must already exist; it does not create or migrate them.
Tables missing from ClickHouse are silently skipped.

**Uniform distributions.** Entities are spread evenly across parents.
Real GitLab data has power-law distributions (some projects have thousands
of MRs, most have few). The generator does not model this skew.

**Flat work item hierarchies.** Work item parent links all point to the
first work item in each project. Real work items form deeper trees.

**No cross-project relationships.** MergeRequest source and target
projects are always the same project. Forked-project MR workflows are
not modeled.

**No code data.** The generator covers SDLC entities only. It does not
produce code indexing data (call graphs, definitions, references).

**No temporal consistency.** Timestamps are generated independently per
column. A merge request's `merged_at` might precede its `created_at`.

**String content is synthetic.** Names, descriptions, and other text
fields are lorem-ipsum-style placeholders. They do not resemble real
GitLab content.

**Fixed column type support.** The generator handles `Int64`, `Int8`,
`Utf8`, `Boolean`, `Float64`, `Date32`, and `List<Int64>`. Any other
Arrow data type falls back to null.

**Truncation on re-run.** The seeding phase truncates all stage tables
before writing, so running the generator against a database that holds
real data would permanently delete that data.

## Module structure

- `src/lib.rs`: top-level `run()` entrypoint that orchestrates the full pipeline
- `src/domain/`: domain model
  - `foundation.rs`: foundation entities and ID allocation
  - `layout.rs`: per-table row counts and synthetic ID helpers
- `src/seeding/`: seeding pipeline (what to generate and in what order)
  - `catalog.rs`: stage ordering and table metadata
  - `pipeline.rs`: concurrent batch generation and ClickHouse inserts
  - `state_builder.rs`: builds `HierarchyState` after seeding
- `src/data_generation/`: row-level building toolkit (how to construct Arrow batches)
  - `schema_registry.rs`: fetches and caches Arrow schemas from ClickHouse
  - `row_builder.rs`: `DirectBatchBuilder` for columnar Arrow array construction
  - `fake_values.rs`: deterministic fake value generation per column type
- `src/continuous.rs`: continuous insert/update/delete traffic after initial seed
- `src/state.rs`: compressed state persistence (`save` / `load`)

## Configuration

Use `crates/datalake-generator/datalake-generator.yaml` as the baseline.

Key sections:
- `datalake`: ClickHouse connection and database
- `generation`: batch size, root counts, per-project counts, field overrides
- `continuous`: continuous mode controls
- `metrics`: report output
- `state`: state directory

## Metrics output

When metrics are enabled, the generator writes:
- JSON report at `metrics.output_path`
- stdout summary with duration, table row counts, and resource usage
Loading