Loading .gitignore +1 −0 Original line number Diff line number Diff line Loading @@ -18,6 +18,7 @@ gl_synthetic_data/ gkg_simulator_manifest.json crates/simulator/profiles evaluation_results/ datalake-generator-state/ # Tilt secrets (contains passwords) .tilt-secrets Loading Cargo.lock +169 −43 Original line number Diff line number Diff line Loading @@ -113,7 +113,7 @@ version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" dependencies = [ "windows-sys 0.60.2", "windows-sys 0.61.2", ] [[package]] Loading @@ -124,7 +124,7 @@ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" dependencies = [ "anstyle", "once_cell_polyfill", "windows-sys 0.60.2", "windows-sys 0.61.2", ] [[package]] Loading Loading @@ -674,9 +674,9 @@ checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" [[package]] name = "bitflags" version = "2.10.0" version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af" [[package]] name = "block-buffer" Loading Loading @@ -852,9 +852,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" version = "1.2.55" version = "1.2.56" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "47b26a0954ae34af09b50f0de26458fa95369a0d478d8236d3f93082b219bd29" checksum = "aebf35691d1bfb0ac386a69bac2fde4dd276fb618cf8bf4f5318fe285e821bb2" dependencies = [ "find-msvc-tools", "jobserver", Loading Loading @@ -1179,6 +1179,15 @@ dependencies = [ "libc", ] [[package]] name = "crc32fast" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" dependencies = [ "cfg-if", ] 
[[package]] name = "criterion" version = "0.8.2" Loading Loading @@ -1983,6 +1992,29 @@ dependencies = [ "sqlparser", ] [[package]] name = "datalake-generator" version = "0.1.0" dependencies = [ "anyhow", "arrow", "chrono", "clap", "clickhouse-client", "flate2", "ontology", "opentelemetry", "rand 0.8.5", "serde", "serde_json", "serde_yaml", "sysinfo", "tokio", "tokio-util", "tracing", "tracing-subscriber", ] [[package]] name = "der" version = "0.7.10" Loading @@ -1996,9 +2028,9 @@ dependencies = [ [[package]] name = "deranged" version = "0.5.5" version = "0.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ececcb659e7ba858fb4f10388c250a7252eb0a27373f1a72b8748afdd248e587" checksum = "cc3dc5ad92c2e2d1c193bbbbdf2ea477cb81331de4f3103f267ca18368b988c4" dependencies = [ "powerfmt", "serde_core", Loading Loading @@ -2376,6 +2408,7 @@ version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" dependencies = [ "crc32fast", "miniz_oxide", "zlib-rs", ] Loading Loading @@ -3275,9 +3308,9 @@ dependencies = [ [[package]] name = "indicatif" version = "0.18.3" version = "0.18.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9375e112e4b463ec1b1c6c011953545c65a30164fbab5b581df32b3abf0dcb88" checksum = "25470f23803092da7d239834776d653104d551bc4d7eacaf31e6837854b8e9eb" dependencies = [ "console", "portable-atomic", Loading Loading @@ -3364,9 +3397,9 @@ checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" [[package]] name = "jiff" version = "0.2.19" version = "0.2.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d89a5b5e10d5a9ad6e5d1f4bd58225f655d6fe9767575a5e8ac5a6fe64e04495" checksum = "c867c356cc096b33f4981825ab281ecba3db0acefe60329f044c1789d94c6543" dependencies = [ "jiff-static", "log", Loading @@ -3377,9 +3410,9 @@ dependencies = [ [[package]] name = 
"jiff-static" version = "0.2.19" version = "0.2.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff7a39c8862fc1369215ccf0a8f12dd4598c7f6484704359f0351bd617034dbf" checksum = "f7946b4325269738f270bb55b3c19ab5c5040525f83fd625259422a9d25d9be5" dependencies = [ "proc-macro2", "quote", Loading Loading @@ -3721,9 +3754,9 @@ dependencies = [ [[package]] name = "libc" version = "0.2.181" version = "0.2.182" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "459427e2af2b9c839b132acb702a1c654d95e10f8c326bfc2ad11310e458b1c5" checksum = "6800badb6cb2082ffd7b6a67e6125bb39f18782f793520caee8cb8846be06112" [[package]] name = "libloading" Loading Loading @@ -3759,7 +3792,7 @@ checksum = "3d0b95e02c851351f877147b7deea7b1afb1df71b63aa5f8270716e0c5720616" dependencies = [ "bitflags", "libc", "redox_syscall 0.7.0", "redox_syscall 0.7.1", ] [[package]] Loading Loading @@ -3932,6 +3965,15 @@ dependencies = [ "minimal-lexical", ] [[package]] name = "ntapi" version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c3b335231dfd352ffb0f8017f3b6027a4917f7df785ea2143d8af2adc66980ae" dependencies = [ "winapi", ] [[package]] name = "nu-ansi-term" version = "0.50.3" Loading Loading @@ -4063,6 +4105,25 @@ dependencies = [ "libc", ] [[package]] name = "objc2-core-foundation" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1c10c2894a6fed806ade6027bcd50662746363a9589d3ec9d9bef30a4e4bc166" dependencies = [ "bitflags", ] [[package]] name = "objc2-io-kit" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "71c1c64d6120e51cd86033f67176b1cb66780c2efe34dec55176f77befd93c0a" dependencies = [ "libc", "objc2-core-foundation", ] [[package]] name = "object" version = "0.37.3" Loading Loading @@ -4773,7 +4834,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7" dependencies = [ "heck", "itertools 0.13.0", "itertools 0.14.0", "log", "multimap", "petgraph", Loading @@ -4794,7 +4855,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b" dependencies = [ "anyhow", "itertools 0.13.0", "itertools 0.14.0", "proc-macro2", "quote", "syn", Loading Loading @@ -5101,9 +5162,9 @@ dependencies = [ [[package]] name = "redox_syscall" version = "0.7.0" version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49f3fe0889e69e2ae9e41f4d6c4c0181701d00e4697b356fb1f74173a5e0ee27" checksum = "35985aa610addc02e24fc232012c86fd11f14111180f902b67e2d5331f8ebf2b" dependencies = [ "bitflags", ] Loading Loading @@ -5441,7 +5502,7 @@ dependencies = [ "openssl-probe 0.2.1", "rustls-pki-types", "schannel", "security-framework 3.5.1", "security-framework 3.6.0", ] [[package]] Loading Loading @@ -5478,7 +5539,7 @@ dependencies = [ "rustls-native-certs 0.8.3", "rustls-platform-verifier-android", "rustls-webpki 0.103.9", "security-framework 3.5.1", "security-framework 3.6.0", "security-framework-sys", "webpki-root-certs", "windows-sys 0.61.2", Loading Loading @@ -5637,9 +5698,9 @@ dependencies = [ [[package]] name = "security-framework" version = "3.5.1" version = "3.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b3297343eaf830f66ede390ea39da1d462b6b0c1b000f420d0a83f898bbbe6ef" checksum = "d17b898a6d6948c3a8ee4372c17cb384f90d2e6e912ef00895b14fd7ab54ec38" dependencies = [ "bitflags", "core-foundation 0.10.1", Loading @@ -5650,9 +5711,9 @@ dependencies = [ [[package]] name = "security-framework-sys" version = "2.15.0" version = "2.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cc1f0cbffaac4852523ce30d8bd3c5cdc873501d96ff467ca09b6767bb8cd5c0" checksum = 
"321c8673b092a9a42605034a9879d73cb79101ed5fd117bc9a597b89b4e9e61a" dependencies = [ "core-foundation-sys", "libc", Loading Loading @@ -5938,9 +5999,9 @@ checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" [[package]] name = "simple_asn1" version = "0.6.3" version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "297f631f50729c8c99b84667867963997ec0b50f32b2a7dbcab828ef0541e8bb" checksum = "0d585997b0ac10be3c5ee635f1bab02d512760d14b7c468801ac8a01d9ae5f1d" dependencies = [ "num-bigint", "num-traits", Loading Loading @@ -6348,9 +6409,9 @@ dependencies = [ [[package]] name = "syn" version = "2.0.114" version = "2.0.116" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4d107df263a3013ef9b1879b0df87d706ff80f65a86ea879bd9c31f9b307c2a" checksum = "3df424c70518695237746f84cede799c9c58fcb37450d7b23716568cc8bc69cb" dependencies = [ "proc-macro2", "quote", Loading @@ -6377,6 +6438,20 @@ dependencies = [ "syn", ] [[package]] name = "sysinfo" version = "0.38.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1efc19935b4b66baa6f654ac7924c192f55b175c00a7ab72410fc24284dacda8" dependencies = [ "libc", "memchr", "ntapi", "objc2-core-foundation", "objc2-io-kit", "windows", ] [[package]] name = "tabled" version = "0.20.0" Loading Loading @@ -6729,9 +6804,9 @@ dependencies = [ [[package]] name = "toml_parser" version = "1.0.7+spec-1.1.0" version = "1.0.9+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "247eaa3197818b831697600aadf81514e577e0cba5eab10f7e064e78ae154df1" checksum = "702d4415e08923e7e1ef96cd5727c0dfed80b4d2fa25db9647fe5eb6f7c5a4c4" dependencies = [ "winnow", ] Loading Loading @@ -6796,9 +6871,9 @@ dependencies = [ [[package]] name = "tonic-prost-build" version = "0.14.3" version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"a4556786613791cfef4ed134aa670b61a85cfcacf71543ef33e8d801abae988f" checksum = "65873ace111e90344b8973e94a1fc817c924473affff24629281f90daed1cd2e" dependencies = [ "prettyplease", "proc-macro2", Loading @@ -6812,9 +6887,9 @@ dependencies = [ [[package]] name = "toon-format" version = "0.4.1" version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "349a10106f2c703fbfbe4a2eef6683af0acca60a0b8334c7a33795231dbf92d5" checksum = "ee5b663187f1bbcad8232ce10e0ba75ba95b169a6126938768649239e7048a52" dependencies = [ "indexmap 2.13.0", "serde", Loading Loading @@ -7150,9 +7225,9 @@ checksum = "81b79ad29b5e19de4260020f8919b443b2ef0277d242ce532ec7b7a2cc8b6007" [[package]] name = "unicode-ident" version = "1.0.23" version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "537dd038a89878be9b64dd4bd1b260315c1bb94f4d784956b81e27a088d9a09e" checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" [[package]] name = "unicode-segmentation" Loading Loading @@ -7507,6 +7582,27 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] name = "windows" version = "0.62.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "527fadee13e0c05939a6a05d5bd6eec6cd2e3dbd648b9f8e447c6518133d8580" dependencies = [ "windows-collections", "windows-core", "windows-future", "windows-numerics", ] [[package]] name = "windows-collections" version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "23b2d95af1a8a14a3c7367e1ed4fc9c20e0a26e79551b1454d72583c97cc6610" dependencies = [ "windows-core", ] [[package]] name = "windows-core" version = "0.62.2" Loading @@ -7520,6 +7616,17 @@ dependencies = [ "windows-strings", ] [[package]] name = "windows-future" version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "e1d6f90251fe18a279739e78025bd6ddc52a7e22f921070ccdc67dde84c605cb" dependencies = [ "windows-core", "windows-link", "windows-threading", ] [[package]] name = "windows-implement" version = "0.60.2" Loading Loading @@ -7548,6 +7655,16 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" [[package]] name = "windows-numerics" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6e2e40844ac143cdb44aead537bbf727de9b044e107a0f1220392177d15b0f26" dependencies = [ "windows-core", "windows-link", ] [[package]] name = "windows-result" version = "0.4.1" Loading Loading @@ -7659,6 +7776,15 @@ dependencies = [ "windows_x86_64_msvc 0.53.1", ] [[package]] name = "windows-threading" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3949bd5b99cafdf1c7ca86b43ca564028dfe27d66958f2470940f73d86d75b37" dependencies = [ "windows-link", ] [[package]] name = "windows_aarch64_gnullvm" version = "0.42.2" Loading Loading @@ -8021,15 +8147,15 @@ dependencies = [ [[package]] name = "zlib-rs" version = "0.6.0" version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7948af682ccbc3342b6e9420e8c51c1fe5d7bf7756002b4a3c6cabfe96a7e3c" checksum = "3a33bbf307b25a1774cee0687694ec72fa7814b3ab5c1c12a9d2fc6a36fc439c" [[package]] name = "zmij" version = "1.0.20" version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4de98dfa5d5b7fef4ee834d0073d560c9ca7b6c46a71d058c48db7960f8cfaf7" checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" [[package]] name = "zstd" Loading Cargo.toml +1 −0 Original line number Diff line number Diff line Loading @@ -18,6 +18,7 @@ members = [ "crates/simulator", "crates/health-check", "crates/siphon-proto", "crates/datalake-generator", ] [workspace.dependencies] Loading 
crates/datalake-generator/Cargo.toml 0 → 100644 +29 −0 Original line number Diff line number Diff line [package] name = "datalake-generator" version = "0.1.0" edition = "2024" license = "LicenseRef-EE" [[bin]] name = "datalake-generate" path = "src/bin/generate.rs" [dependencies] anyhow.workspace = true arrow.workspace = true chrono.workspace = true clap.workspace = true flate2 = "1" opentelemetry.workspace = true rand = { version = "0.8", features = ["small_rng"] } serde.workspace = true serde_json.workspace = true serde_yaml.workspace = true sysinfo = "0.38.1" tokio.workspace = true tokio-util.workspace = true tracing.workspace = true tracing-subscriber.workspace = true clickhouse-client = { path = "../clickhouse-client" } ontology = { path = "../ontology" } crates/datalake-generator/README.md 0 → 100644 +284 −0 Original line number Diff line number Diff line # Datalake Generator > **Not for production use.** This tool generates synthetic data for development and testing only. High-throughput seeding harness for ClickHouse tables used by the Knowledge Graph. The generator has a single seeding architecture with: - deterministic foundation generation (users, groups, projects), - staged table writes with dependency ordering, - optional continuous mode for ongoing insert/update/delete traffic. ## Quick start ```bash cargo run --bin datalake-generate -- -c crates/datalake-generator/datalake-generator.yaml ``` ## CLI ```bash cargo run --bin datalake-generate -- [OPTIONS] ``` Options: - `-c, --config <PATH>`: YAML config path (default `datalake-generator.yaml`) - `--skip-seeding`: skip the initial seed and run from saved state ## Main flow 1. Build foundation entities from config. 2. Truncate stage tables that exist in the target ClickHouse schema. 3. Run staged writes in dependency order. 4. Persist state for continuous mode. 5. Optionally run continuous mode. 6. Write metrics report. 
## Data generation pipeline ``` OS threads (std::thread::scope) ┌──────────────────────────────────────────┐ │ │ │ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ │ │ Table A │ │ Table B │ │ Table C │ │ │ │ producer │ │ producer │ │ producer │ │ │ └────┬─────┘ └────┬─────┘ └────┬─────┘ │ │ │ │ │ │ └───────┼────────────┼────────────┼────────┘ │ │ │ ▼ ▼ ▼ ┌────────────────────────────────────────┐ │ sync_channel (bounded, capacity 16) │ │ Arrow RecordBatches │ └──────────────────┬─────────────────────┘ │ ▼ ┌───────────────────────┐ │ Consumer thread │ │ (spawn_blocking) │ │ │ │ tokio::spawn per │ │ batch -> ClickHouse │ │ HTTP insert │ └───────────────────────┘ ``` Each stage (foundation, primary, secondary, leaf) runs this pipeline for its tables. Within a stage, all tables generate rows in parallel. ## Graph structure The generator builds a property graph that mirrors a GitLab instance's SDLC data. The graph has three layers, each depending on the one above. ### Foundation layer Built first, deterministically from config. These entities form the skeleton that all project-scoped data hangs off of. ``` Organization (implicit, always id=1) ├── Users (flat list) └── Groups (hierarchical) ├── Root Group 1 │ ├── Subgroup 1a │ │ └── Subgroup 1a-i │ └── Subgroup 1b └── Root Group 2 └── ... └── Projects (under each group) ``` Each group gets an entity ID, a namespace ID, and a traversal path (`"1/2/3/"`) encoding its position in the hierarchy. Projects inherit their parent group's namespace and path. ### Project-scoped entities Generated per-project according to `per_project` counts in the config. Every entity gets a synthetic ID, a `project_id`, a `namespace_id`, and a `traversal_path` linking it back to its project. 
Entities are written in four dependency-ordered stages: | Stage | Tables | Depends on | |-------|--------|------------| | Foundation | `siphon_users`, `siphon_namespaces`, `siphon_namespace_details`, `namespace_traversal_paths`, `siphon_projects`, `project_namespace_traversal_paths`, `siphon_knowledge_graph_enabled_namespaces` | nothing | | Primary | `hierarchy_merge_requests`, `hierarchy_work_items`, `siphon_issues`, `siphon_p_ci_pipelines`, `siphon_vulnerabilities`, `siphon_vulnerability_scanners`, `siphon_vulnerability_identifiers`, `siphon_vulnerability_occurrences`, `siphon_milestones`, `siphon_labels`, `siphon_members` | Foundation | | Secondary | `siphon_notes`, `siphon_merge_request_diffs`, `siphon_p_ci_stages`, `siphon_security_scans`, `siphon_vulnerability_merge_request_links`, `siphon_merge_requests_closing_issues`, `siphon_work_item_parent_links`, `siphon_issue_links`, `siphon_vulnerability_occurrence_identifiers` | Primary | | Leaf | `siphon_p_ci_builds`, `siphon_security_findings`, `siphon_merge_request_diff_files` | Secondary | Within a stage, all tables generate in parallel. The next stage starts only after the previous stage finishes. ### Relationships Parent-child and cross-entity relationships are wired deterministically using `map_child_to_parent_index`, which spreads children evenly across parents: `parent_index = (child_index * parent_count) / child_count`. 
Relationships expressed: ``` MergeRequest ──────── Project (target_project_id, source_project_id) MergeRequest ──────── User (author_id) WorkItem ──────────── User (author_id) Note ─────────────┬── MergeRequest (noteable_id, split by ratio) └── WorkItem (noteable_id) MergeRequestDiff ──── MergeRequest (merge_request_id) MergeRequestDiffFile ─ MergeRequestDiff (merge_request_diff_id) Stage ─────────────── Pipeline (pipeline_id) Job ───────────────── Stage (stage_id) SecurityScan ──────── Pipeline + Job (pipeline_id, build_id) SecurityFinding ───── SecurityScan + VulnerabilityScanner (scan_id, scanner_id) Vulnerability ─────── User (author_id) VulnerabilityOccurrence ─ Vulnerability + Scanner + Identifier VulnerabilityMergeRequestLink ─ Vulnerability + MergeRequest MergeRequestClosingIssue ──── MergeRequest + WorkItem WorkItemParentLink ── WorkItem (parent) + WorkItem (child) IssueLink ─────────── WorkItem (source) + WorkItem (target) Member ────────────── Project + User (source_id, user_id) ``` Notes are split between MergeRequest and WorkItem parents proportionally to their respective counts. Work item parent links form a flat hierarchy (all children point to the first work item in the project). ## Synthetic ID scheme IDs are computed arithmetically so that every row across every table gets a globally unique, deterministic ID with no coordination or sequence counters. ``` table_id_base = next_entity_id + (table_position * block_size) block_size = project_count * max_rows_per_project + 1 row_id = table_id_base + (project_index * rows_per_project) + entity_index ``` `table_position` is a fixed ordinal for each project-scoped table (0 for merge_requests, 1 for work_items, etc.), defined in `catalog.rs`. This spreads each table's ID range into a non-overlapping block. Child tables compute parent IDs using the parent table's base and the same formula, so referential links are computed without lookups. 
## Fake value generation The generator auto-fills any column not explicitly set by a relationship writer. Column names are classified by pattern matching into kinds, and each kind produces plausible-looking synthetic data: | Column pattern | Kind | Example output | |---------------|------|----------------| | `id`, `*_id` | Id | `42317` | | `iid` | Iid | `1` through `10000` | | `*email*` | Email | `user12ab@example.com` | | `*url*` | Url | `https://example.com/a1b2/c3d4` | | `*sha*`, `*hash*`, `*fingerprint*` | Sha | 40-char hex string | | `*path*` | Path | `/p1a/d2b/c3d4e5f6` | | `*name*`, `*title*` | Name | `alpha_a1b2c3d4` | | `*description*`, `*body*`, `*note*` | Description | `Lorem ipsum dolor a1b2` | | `*status*` | Status | `open`, `closed`, `merged`, `pending`, `active` | | `*state*` | State | `pending`, `running`, `success`, `failed`, `canceled` | | `*ref*`, `*branch*` | Branch | `feature/branch-a1b2` | | `uuid`, `*_uuid` | Uuid | RFC 4122 format | | `*_ids` | IdList | `[1, 2345, 6789]` | | `*_at`, `created_at`, `updated_at` | DateTime | timestamp within last 5 years | | anything else (string) | GenericString | `val<hex>` | Nullable columns have a ~10% chance of being null. ### Field overrides The `field_overrides` config section constrains generated values for enum-like columns to valid domain values. Without overrides, status/state columns pick from a small hardcoded pool. With overrides, the generator picks uniformly from the provided list: ```yaml field_overrides: MergeRequest: state_id: [1, 2, 3, 4] merge_status: ["unchecked", "can_be_merged", "cannot_be_merged"] ``` ### Determinism All generation is seeded. The base seed (default `42`) is XORed with a per-table offset so each table gets a different random sequence while remaining fully reproducible. The RNG is `SmallRng` seeded via `seed_from_u64`, and the counter is mixed with the 64-bit golden-ratio constant (`0x9e3779b97f4a7c15`) for better bit distribution. 
## Limitations **Single organization.** The generator always uses `organization_id = 1`. Multi-org generation is not supported. **No schema DDL.** The generator reads schemas from a running ClickHouse instance. Tables must already exist; it does not create or migrate them. Tables missing from ClickHouse are silently skipped. **Uniform distributions.** Entities are spread evenly across parents. Real GitLab data has power-law distributions (some projects have thousands of MRs, most have few). The generator does not model this skew. **Flat work item hierarchies.** Work item parent links all point to the first work item in each project. Real work items form deeper trees. **No cross-project relationships.** MergeRequest source and target projects are always the same project. Forked-project MR workflows are not modeled. **No code data.** The generator covers SDLC entities only. It does not produce code indexing data (call graphs, definitions, references). **No temporal consistency.** Timestamps are generated independently per column. A merge request's `merged_at` might precede its `created_at`. **String content is synthetic.** Names, descriptions, and other text fields are lorem-ipsum-style placeholders. They do not resemble real GitLab content. **Fixed column type support.** The generator handles `Int64`, `Int8`, `Utf8`, `Boolean`, `Float64`, `Date32`, and `List<Int64>`. Columns of any other Arrow data type are filled with nulls. **Truncation on re-run.** The seeding phase truncates all stage tables before writing. Running the generator against a database that holds real data would destroy that data. 
## Module structure - `src/lib.rs`: top-level `run()` entrypoint that orchestrates the full pipeline - `src/domain/`: domain model - `foundation.rs`: foundation entities and ID allocation - `layout.rs`: per-table row counts and synthetic ID helpers - `src/seeding/`: seeding pipeline (what to generate and in what order) - `catalog.rs`: stage ordering and table metadata - `pipeline.rs`: concurrent batch generation and ClickHouse inserts - `state_builder.rs`: builds `HierarchyState` after seeding - `src/data_generation/`: row-level building toolkit (how to construct Arrow batches) - `schema_registry.rs`: fetches and caches Arrow schemas from ClickHouse - `row_builder.rs`: `DirectBatchBuilder` for columnar Arrow array construction - `fake_values.rs`: deterministic fake value generation per column type - `src/continuous.rs`: continuous insert/update/delete traffic after initial seed - `src/state.rs`: compressed state persistence (`save` / `load`) ## Configuration Use `crates/datalake-generator/datalake-generator.yaml` as the baseline. Key sections: - `datalake`: ClickHouse connection and database - `generation`: batch size, root counts, per-project counts, field overrides - `continuous`: continuous mode controls - `metrics`: report output - `state`: state directory ## Metrics output When metrics are enabled, the generator writes: - JSON report at `metrics.output_path` - stdout summary with duration, table row counts, and resource usage Loading
.gitignore +1 −0 Original line number Diff line number Diff line Loading @@ -18,6 +18,7 @@ gl_synthetic_data/ gkg_simulator_manifest.json crates/simulator/profiles evaluation_results/ datalake-generator-state/ # Tilt secrets (contains passwords) .tilt-secrets Loading
Cargo.lock +169 −43 Original line number Diff line number Diff line Loading @@ -113,7 +113,7 @@ version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" dependencies = [ "windows-sys 0.60.2", "windows-sys 0.61.2", ] [[package]] Loading @@ -124,7 +124,7 @@ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" dependencies = [ "anstyle", "once_cell_polyfill", "windows-sys 0.60.2", "windows-sys 0.61.2", ] [[package]] Loading Loading @@ -674,9 +674,9 @@ checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" [[package]] name = "bitflags" version = "2.10.0" version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af" [[package]] name = "block-buffer" Loading Loading @@ -852,9 +852,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" version = "1.2.55" version = "1.2.56" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "47b26a0954ae34af09b50f0de26458fa95369a0d478d8236d3f93082b219bd29" checksum = "aebf35691d1bfb0ac386a69bac2fde4dd276fb618cf8bf4f5318fe285e821bb2" dependencies = [ "find-msvc-tools", "jobserver", Loading Loading @@ -1179,6 +1179,15 @@ dependencies = [ "libc", ] [[package]] name = "crc32fast" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" dependencies = [ "cfg-if", ] [[package]] name = "criterion" version = "0.8.2" Loading Loading @@ -1983,6 +1992,29 @@ dependencies = [ "sqlparser", ] [[package]] name = "datalake-generator" version = "0.1.0" dependencies = [ "anyhow", "arrow", "chrono", "clap", "clickhouse-client", "flate2", "ontology", 
"opentelemetry", "rand 0.8.5", "serde", "serde_json", "serde_yaml", "sysinfo", "tokio", "tokio-util", "tracing", "tracing-subscriber", ] [[package]] name = "der" version = "0.7.10" Loading @@ -1996,9 +2028,9 @@ dependencies = [ [[package]] name = "deranged" version = "0.5.5" version = "0.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ececcb659e7ba858fb4f10388c250a7252eb0a27373f1a72b8748afdd248e587" checksum = "cc3dc5ad92c2e2d1c193bbbbdf2ea477cb81331de4f3103f267ca18368b988c4" dependencies = [ "powerfmt", "serde_core", Loading Loading @@ -2376,6 +2408,7 @@ version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" dependencies = [ "crc32fast", "miniz_oxide", "zlib-rs", ] Loading Loading @@ -3275,9 +3308,9 @@ dependencies = [ [[package]] name = "indicatif" version = "0.18.3" version = "0.18.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9375e112e4b463ec1b1c6c011953545c65a30164fbab5b581df32b3abf0dcb88" checksum = "25470f23803092da7d239834776d653104d551bc4d7eacaf31e6837854b8e9eb" dependencies = [ "console", "portable-atomic", Loading Loading @@ -3364,9 +3397,9 @@ checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" [[package]] name = "jiff" version = "0.2.19" version = "0.2.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d89a5b5e10d5a9ad6e5d1f4bd58225f655d6fe9767575a5e8ac5a6fe64e04495" checksum = "c867c356cc096b33f4981825ab281ecba3db0acefe60329f044c1789d94c6543" dependencies = [ "jiff-static", "log", Loading @@ -3377,9 +3410,9 @@ dependencies = [ [[package]] name = "jiff-static" version = "0.2.19" version = "0.2.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff7a39c8862fc1369215ccf0a8f12dd4598c7f6484704359f0351bd617034dbf" checksum = "f7946b4325269738f270bb55b3c19ab5c5040525f83fd625259422a9d25d9be5" dependencies 
= [ "proc-macro2", "quote", Loading Loading @@ -3721,9 +3754,9 @@ dependencies = [ [[package]] name = "libc" version = "0.2.181" version = "0.2.182" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "459427e2af2b9c839b132acb702a1c654d95e10f8c326bfc2ad11310e458b1c5" checksum = "6800badb6cb2082ffd7b6a67e6125bb39f18782f793520caee8cb8846be06112" [[package]] name = "libloading" Loading Loading @@ -3759,7 +3792,7 @@ checksum = "3d0b95e02c851351f877147b7deea7b1afb1df71b63aa5f8270716e0c5720616" dependencies = [ "bitflags", "libc", "redox_syscall 0.7.0", "redox_syscall 0.7.1", ] [[package]] Loading Loading @@ -3932,6 +3965,15 @@ dependencies = [ "minimal-lexical", ] [[package]] name = "ntapi" version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c3b335231dfd352ffb0f8017f3b6027a4917f7df785ea2143d8af2adc66980ae" dependencies = [ "winapi", ] [[package]] name = "nu-ansi-term" version = "0.50.3" Loading Loading @@ -4063,6 +4105,25 @@ dependencies = [ "libc", ] [[package]] name = "objc2-core-foundation" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1c10c2894a6fed806ade6027bcd50662746363a9589d3ec9d9bef30a4e4bc166" dependencies = [ "bitflags", ] [[package]] name = "objc2-io-kit" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "71c1c64d6120e51cd86033f67176b1cb66780c2efe34dec55176f77befd93c0a" dependencies = [ "libc", "objc2-core-foundation", ] [[package]] name = "object" version = "0.37.3" Loading Loading @@ -4773,7 +4834,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7" dependencies = [ "heck", "itertools 0.13.0", "itertools 0.14.0", "log", "multimap", "petgraph", Loading @@ -4794,7 +4855,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b" dependencies = [ "anyhow", "itertools 0.13.0", "itertools 0.14.0", "proc-macro2", "quote", "syn", Loading Loading @@ -5101,9 +5162,9 @@ dependencies = [ [[package]] name = "redox_syscall" version = "0.7.0" version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49f3fe0889e69e2ae9e41f4d6c4c0181701d00e4697b356fb1f74173a5e0ee27" checksum = "35985aa610addc02e24fc232012c86fd11f14111180f902b67e2d5331f8ebf2b" dependencies = [ "bitflags", ] Loading Loading @@ -5441,7 +5502,7 @@ dependencies = [ "openssl-probe 0.2.1", "rustls-pki-types", "schannel", "security-framework 3.5.1", "security-framework 3.6.0", ] [[package]] Loading Loading @@ -5478,7 +5539,7 @@ dependencies = [ "rustls-native-certs 0.8.3", "rustls-platform-verifier-android", "rustls-webpki 0.103.9", "security-framework 3.5.1", "security-framework 3.6.0", "security-framework-sys", "webpki-root-certs", "windows-sys 0.61.2", Loading Loading @@ -5637,9 +5698,9 @@ dependencies = [ [[package]] name = "security-framework" version = "3.5.1" version = "3.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b3297343eaf830f66ede390ea39da1d462b6b0c1b000f420d0a83f898bbbe6ef" checksum = "d17b898a6d6948c3a8ee4372c17cb384f90d2e6e912ef00895b14fd7ab54ec38" dependencies = [ "bitflags", "core-foundation 0.10.1", Loading @@ -5650,9 +5711,9 @@ dependencies = [ [[package]] name = "security-framework-sys" version = "2.15.0" version = "2.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cc1f0cbffaac4852523ce30d8bd3c5cdc873501d96ff467ca09b6767bb8cd5c0" checksum = "321c8673b092a9a42605034a9879d73cb79101ed5fd117bc9a597b89b4e9e61a" dependencies = [ "core-foundation-sys", "libc", Loading Loading @@ -5938,9 +5999,9 @@ checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" [[package]] name = "simple_asn1" version = "0.6.3" version = "0.6.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "297f631f50729c8c99b84667867963997ec0b50f32b2a7dbcab828ef0541e8bb" checksum = "0d585997b0ac10be3c5ee635f1bab02d512760d14b7c468801ac8a01d9ae5f1d" dependencies = [ "num-bigint", "num-traits", Loading Loading @@ -6348,9 +6409,9 @@ dependencies = [ [[package]] name = "syn" version = "2.0.114" version = "2.0.116" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4d107df263a3013ef9b1879b0df87d706ff80f65a86ea879bd9c31f9b307c2a" checksum = "3df424c70518695237746f84cede799c9c58fcb37450d7b23716568cc8bc69cb" dependencies = [ "proc-macro2", "quote", Loading @@ -6377,6 +6438,20 @@ dependencies = [ "syn", ] [[package]] name = "sysinfo" version = "0.38.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1efc19935b4b66baa6f654ac7924c192f55b175c00a7ab72410fc24284dacda8" dependencies = [ "libc", "memchr", "ntapi", "objc2-core-foundation", "objc2-io-kit", "windows", ] [[package]] name = "tabled" version = "0.20.0" Loading Loading @@ -6729,9 +6804,9 @@ dependencies = [ [[package]] name = "toml_parser" version = "1.0.7+spec-1.1.0" version = "1.0.9+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "247eaa3197818b831697600aadf81514e577e0cba5eab10f7e064e78ae154df1" checksum = "702d4415e08923e7e1ef96cd5727c0dfed80b4d2fa25db9647fe5eb6f7c5a4c4" dependencies = [ "winnow", ] Loading Loading @@ -6796,9 +6871,9 @@ dependencies = [ [[package]] name = "tonic-prost-build" version = "0.14.3" version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a4556786613791cfef4ed134aa670b61a85cfcacf71543ef33e8d801abae988f" checksum = "65873ace111e90344b8973e94a1fc817c924473affff24629281f90daed1cd2e" dependencies = [ "prettyplease", "proc-macro2", Loading @@ -6812,9 +6887,9 @@ dependencies = [ [[package]] name = "toon-format" version = "0.4.1" version = "0.4.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "349a10106f2c703fbfbe4a2eef6683af0acca60a0b8334c7a33795231dbf92d5" checksum = "ee5b663187f1bbcad8232ce10e0ba75ba95b169a6126938768649239e7048a52" dependencies = [ "indexmap 2.13.0", "serde", Loading Loading @@ -7150,9 +7225,9 @@ checksum = "81b79ad29b5e19de4260020f8919b443b2ef0277d242ce532ec7b7a2cc8b6007" [[package]] name = "unicode-ident" version = "1.0.23" version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "537dd038a89878be9b64dd4bd1b260315c1bb94f4d784956b81e27a088d9a09e" checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" [[package]] name = "unicode-segmentation" Loading Loading @@ -7507,6 +7582,27 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] name = "windows" version = "0.62.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "527fadee13e0c05939a6a05d5bd6eec6cd2e3dbd648b9f8e447c6518133d8580" dependencies = [ "windows-collections", "windows-core", "windows-future", "windows-numerics", ] [[package]] name = "windows-collections" version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "23b2d95af1a8a14a3c7367e1ed4fc9c20e0a26e79551b1454d72583c97cc6610" dependencies = [ "windows-core", ] [[package]] name = "windows-core" version = "0.62.2" Loading @@ -7520,6 +7616,17 @@ dependencies = [ "windows-strings", ] [[package]] name = "windows-future" version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e1d6f90251fe18a279739e78025bd6ddc52a7e22f921070ccdc67dde84c605cb" dependencies = [ "windows-core", "windows-link", "windows-threading", ] [[package]] name = "windows-implement" version = "0.60.2" Loading Loading @@ -7548,6 +7655,16 @@ version = "0.2.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" [[package]] name = "windows-numerics" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6e2e40844ac143cdb44aead537bbf727de9b044e107a0f1220392177d15b0f26" dependencies = [ "windows-core", "windows-link", ] [[package]] name = "windows-result" version = "0.4.1" Loading Loading @@ -7659,6 +7776,15 @@ dependencies = [ "windows_x86_64_msvc 0.53.1", ] [[package]] name = "windows-threading" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3949bd5b99cafdf1c7ca86b43ca564028dfe27d66958f2470940f73d86d75b37" dependencies = [ "windows-link", ] [[package]] name = "windows_aarch64_gnullvm" version = "0.42.2" Loading Loading @@ -8021,15 +8147,15 @@ dependencies = [ [[package]] name = "zlib-rs" version = "0.6.0" version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7948af682ccbc3342b6e9420e8c51c1fe5d7bf7756002b4a3c6cabfe96a7e3c" checksum = "3a33bbf307b25a1774cee0687694ec72fa7814b3ab5c1c12a9d2fc6a36fc439c" [[package]] name = "zmij" version = "1.0.20" version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4de98dfa5d5b7fef4ee834d0073d560c9ca7b6c46a71d058c48db7960f8cfaf7" checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" [[package]] name = "zstd" Loading
Cargo.toml +1 −0 Original line number Diff line number Diff line Loading @@ -18,6 +18,7 @@ members = [ "crates/simulator", "crates/health-check", "crates/siphon-proto", "crates/datalake-generator", ] [workspace.dependencies] Loading
crates/datalake-generator/Cargo.toml 0 → 100644 +29 −0 Original line number Diff line number Diff line [package] name = "datalake-generator" version = "0.1.0" edition = "2024" license = "LicenseRef-EE" [[bin]] name = "datalake-generate" path = "src/bin/generate.rs" [dependencies] anyhow.workspace = true arrow.workspace = true chrono.workspace = true clap.workspace = true flate2 = "1" opentelemetry.workspace = true rand = { version = "0.8", features = ["small_rng"] } serde.workspace = true serde_json.workspace = true serde_yaml.workspace = true sysinfo = "0.38.1" tokio.workspace = true tokio-util.workspace = true tracing.workspace = true tracing-subscriber.workspace = true clickhouse-client = { path = "../clickhouse-client" } ontology = { path = "../ontology" }
crates/datalake-generator/README.md 0 → 100644 +284 −0 Original line number Diff line number Diff line # Datalake Generator > **Not for production use.** This tool generates synthetic data for development and testing only. High-throughput seeding harness for ClickHouse tables used by the Knowledge Graph. The generator has a single seeding architecture with: - deterministic foundation generation (users, groups, projects), - staged table writes with dependency ordering, - optional continuous mode for ongoing insert/update/delete traffic. ## Quick start ```bash cargo run --bin datalake-generate -- -c crates/datalake-generator/datalake-generator.yaml ``` ## CLI ```bash cargo run --bin datalake-generate -- [OPTIONS] ``` Options: - `-c, --config <PATH>`: YAML config path (default `datalake-generator.yaml`) - `--skip-seeding`: skip the initial seed and run from saved state ## Main flow 1. Build foundation entities from config. 2. Truncate stage tables that exist in the target ClickHouse schema. 3. Run staged writes in dependency order. 4. Persist state for continuous mode. 5. Optionally run continuous mode. 6. Write metrics report. ## Data generation pipeline ``` OS threads (std::thread::scope) ┌──────────────────────────────────────────┐ │ │ │ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ │ │ Table A │ │ Table B │ │ Table C │ │ │ │ producer │ │ producer │ │ producer │ │ │ └────┬─────┘ └────┬─────┘ └────┬─────┘ │ │ │ │ │ │ └───────┼────────────┼────────────┼────────┘ │ │ │ ▼ ▼ ▼ ┌────────────────────────────────────────┐ │ sync_channel (bounded, capacity 16) │ │ Arrow RecordBatches │ └──────────────────┬─────────────────────┘ │ ▼ ┌───────────────────────┐ │ Consumer thread │ │ (spawn_blocking) │ │ │ │ tokio::spawn per │ │ batch -> ClickHouse │ │ HTTP insert │ └───────────────────────┘ ``` Each stage (foundation, primary, secondary, leaf) runs this pipeline for its tables. Within a stage, all tables generate rows in parallel. 
## Graph structure The generator builds a property graph that mirrors a GitLab instance's SDLC data. The graph has three layers, each depending on the one above. ### Foundation layer Built first, deterministically from config. These entities form the skeleton that all project-scoped data hangs off of. ``` Organization (implicit, always id=1) ├── Users (flat list) └── Groups (hierarchical) ├── Root Group 1 │ ├── Subgroup 1a │ │ └── Subgroup 1a-i │ └── Subgroup 1b └── Root Group 2 └── ... └── Projects (under each group) ``` Each group gets an entity ID, a namespace ID, and a traversal path (`"1/2/3/"`) encoding its position in the hierarchy. Projects inherit their parent group's namespace and path. ### Project-scoped entities Generated per-project according to `per_project` counts in the config. Every entity gets a synthetic ID, a `project_id`, a `namespace_id`, and a `traversal_path` linking it back to its project. Entities are written in four dependency-ordered stages: | Stage | Tables | Depends on | |-------|--------|------------| | Foundation | `siphon_users`, `siphon_namespaces`, `siphon_namespace_details`, `namespace_traversal_paths`, `siphon_projects`, `project_namespace_traversal_paths`, `siphon_knowledge_graph_enabled_namespaces` | nothing | | Primary | `hierarchy_merge_requests`, `hierarchy_work_items`, `siphon_issues`, `siphon_p_ci_pipelines`, `siphon_vulnerabilities`, `siphon_vulnerability_scanners`, `siphon_vulnerability_identifiers`, `siphon_vulnerability_occurrences`, `siphon_milestones`, `siphon_labels`, `siphon_members` | Foundation | | Secondary | `siphon_notes`, `siphon_merge_request_diffs`, `siphon_p_ci_stages`, `siphon_security_scans`, `siphon_vulnerability_merge_request_links`, `siphon_merge_requests_closing_issues`, `siphon_work_item_parent_links`, `siphon_issue_links`, `siphon_vulnerability_occurrence_identifiers` | Primary | | Leaf | `siphon_p_ci_builds`, `siphon_security_findings`, `siphon_merge_request_diff_files` | Secondary | Within a 
stage, all tables generate in parallel. The next stage starts only after the previous stage finishes. ### Relationships Parent-child and cross-entity relationships are wired deterministically using `map_child_to_parent_index`, which spreads children evenly across parents: `parent_index = (child_index * parent_count) / child_count`. Relationships expressed: ``` MergeRequest ──────── Project (target_project_id, source_project_id) MergeRequest ──────── User (author_id) WorkItem ──────────── User (author_id) Note ─────────────┬── MergeRequest (noteable_id, split by ratio) └── WorkItem (noteable_id) MergeRequestDiff ──── MergeRequest (merge_request_id) MergeRequestDiffFile ─ MergeRequestDiff (merge_request_diff_id) Stage ─────────────── Pipeline (pipeline_id) Job ───────────────── Stage (stage_id) SecurityScan ──────── Pipeline + Job (pipeline_id, build_id) SecurityFinding ───── SecurityScan + VulnerabilityScanner (scan_id, scanner_id) Vulnerability ─────── User (author_id) VulnerabilityOccurrence ─ Vulnerability + Scanner + Identifier VulnerabilityMergeRequestLink ─ Vulnerability + MergeRequest MergeRequestClosingIssue ──── MergeRequest + WorkItem WorkItemParentLink ── WorkItem (parent) + WorkItem (child) IssueLink ─────────── WorkItem (source) + WorkItem (target) Member ────────────── Project + User (source_id, user_id) ``` Notes are split between MergeRequest and WorkItem parents proportionally to their respective counts. Work item parent links form a flat hierarchy (all children point to the first work item in the project). ## Synthetic ID scheme IDs are computed arithmetically so that every row across every table gets a globally unique, deterministic ID with no coordination or sequence counters. 
``` table_id_base = next_entity_id + (table_position * block_size) block_size = project_count * max_rows_per_project + 1 row_id = table_id_base + (project_index * rows_per_project) + entity_index ``` `table_position` is a fixed ordinal for each project-scoped table (0 for merge_requests, 1 for work_items, etc.), defined in `catalog.rs`. This spreads each table's ID range into a non-overlapping block. Child tables compute parent IDs using the parent table's base and the same formula, so referential links are computed without lookups. ## Fake value generation The generator auto-fills any column not explicitly set by a relationship writer. Column names are classified by pattern matching into kinds, and each kind produces plausible-looking synthetic data: | Column pattern | Kind | Example output | |---------------|------|----------------| | `id`, `*_id` | Id | `42317` | | `iid` | Iid | `1` through `10000` | | `*email*` | Email | `user12ab@example.com` | | `*url*` | Url | `https://example.com/a1b2/c3d4` | | `*sha*`, `*hash*`, `*fingerprint*` | Sha | 40-char hex string | | `*path*` | Path | `/p1a/d2b/c3d4e5f6` | | `*name*`, `*title*` | Name | `alpha_a1b2c3d4` | | `*description*`, `*body*`, `*note*` | Description | `Lorem ipsum dolor a1b2` | | `*status*` | Status | `open`, `closed`, `merged`, `pending`, `active` | | `*state*` | State | `pending`, `running`, `success`, `failed`, `canceled` | | `*ref*`, `*branch*` | Branch | `feature/branch-a1b2` | | `uuid`, `*_uuid` | Uuid | RFC 4122 format | | `*_ids` | IdList | `[1, 2345, 6789]` | | `*_at`, `created_at`, `updated_at` | DateTime | timestamp within last 5 years | | anything else (string) | GenericString | `val<hex>` | Nullable columns have a ~10% chance of being null. ### Field overrides The `field_overrides` config section constrains generated values for enum-like columns to valid domain values. Without overrides, status/state columns pick from a small hardcoded pool. 
With overrides, the generator picks uniformly from the provided list: ```yaml field_overrides: MergeRequest: state_id: [1, 2, 3, 4] merge_status: ["unchecked", "can_be_merged", "cannot_be_merged"] ``` ### Determinism All generation is seeded. The base seed (default `42`) is XORed with a per-table offset so each table gets a different random sequence while remaining fully reproducible. The RNG is `SmallRng` seeded via `seed_from_u64`, and the counter is mixed with the 64-bit golden-ratio constant (`0x9e3779b97f4a7c15`) for better bit distribution. ## Limitations **Single organization.** The generator always uses `organization_id = 1`. Multi-org generation is not supported. **No schema DDL.** The generator reads schemas from a running ClickHouse instance. Tables must already exist; it does not create or migrate them. Tables missing from ClickHouse are silently skipped. **Uniform distributions.** Entities are spread evenly across parents. Real GitLab data has power-law distributions (some projects have thousands of MRs, most have few). The generator does not model this skew. **Flat work item hierarchies.** Work item parent links all point to the first work item in each project. Real work items form deeper trees. **No cross-project relationships.** MergeRequest source and target projects are always the same project. Forked-project MR workflows are not modeled. **No code data.** The generator covers SDLC entities only. It does not produce code indexing data (call graphs, definitions, references). **No temporal consistency.** Timestamps are generated independently per column. A merge request's `merged_at` might precede its `created_at`. **String content is synthetic.** Names, descriptions, and other text fields are lorem-ipsum-style placeholders. They do not resemble real GitLab content. **Fixed column type support.** The generator handles `Int64`, `Int8`, `Utf8`, `Boolean`, `Float64`, `Date32`, and `List<Int64>`. Any other Arrow data type falls back to null.
**Truncation on re-run.** The seeding phase truncates all stage tables before writing. Running the generator against a database that holds real data would delete that data. ## Module structure - `src/lib.rs`: top-level `run()` entrypoint that orchestrates the full pipeline - `src/domain/`: domain model - `foundation.rs`: foundation entities and ID allocation - `layout.rs`: per-table row counts and synthetic ID helpers - `src/seeding/`: seeding pipeline (what to generate and in what order) - `catalog.rs`: stage ordering and table metadata - `pipeline.rs`: concurrent batch generation and ClickHouse inserts - `state_builder.rs`: builds `HierarchyState` after seeding - `src/data_generation/`: row-level building toolkit (how to construct Arrow batches) - `schema_registry.rs`: fetches and caches Arrow schemas from ClickHouse - `row_builder.rs`: `DirectBatchBuilder` for columnar Arrow array construction - `fake_values.rs`: deterministic fake value generation per column type - `src/continuous.rs`: continuous insert/update/delete traffic after initial seed - `src/state.rs`: compressed state persistence (`save` / `load`) ## Configuration Use `crates/datalake-generator/datalake-generator.yaml` as the baseline. Key sections: - `datalake`: ClickHouse connection and database - `generation`: batch size, root counts, per-project counts, field overrides - `continuous`: continuous mode controls - `metrics`: report output - `state`: state directory ## Metrics output When metrics are enabled, the generator writes: - JSON report at `metrics.output_path` - stdout summary with duration, table row counts, and resource usage