Loading config/SCHEMA_VERSION +1 −1 Original line number Diff line number Diff line 10 11 config/graph.sql +10 −2 Original line number Diff line number Diff line -- SCHEMA_VERSION=10 [AUTOGENERATED — DO NOT EDIT MANUALLY] -- SCHEMA_VERSION=11 [AUTOGENERATED — DO NOT EDIT MANUALLY] -- Regenerate with: mise schema:generate:ddl CREATE TABLE IF NOT EXISTS checkpoint ( Loading Loading @@ -749,7 +749,15 @@ CREATE TABLE IF NOT EXISTS gl_code_edge ( _deleted Bool DEFAULT false, INDEX idx_relationship relationship_kind TYPE set(50) GRANULARITY 2, PROJECTION by_source (SELECT * ORDER BY (source_id, relationship_kind, target_id, traversal_path, source_kind, target_kind)), PROJECTION by_target (SELECT * ORDER BY (target_id, relationship_kind, source_id, traversal_path, source_kind, target_kind)) PROJECTION by_target (SELECT * ORDER BY (target_id, relationship_kind, source_id, traversal_path, source_kind, target_kind)), PROJECTION agg_counts_by_source ( SELECT relationship_kind, target_kind, source_id, traversal_path, count() GROUP BY relationship_kind, target_kind, source_id, traversal_path ), PROJECTION agg_counts_by_target ( SELECT relationship_kind, source_kind, target_id, traversal_path, count() GROUP BY relationship_kind, source_kind, target_id, traversal_path ) ) ENGINE = ReplacingMergeTree(_version, _deleted) ORDER BY (traversal_path, source_id, relationship_kind, target_id, source_kind, target_kind) PRIMARY KEY (traversal_path, source_id, relationship_kind) Loading config/graph_local.sql +1 −1 Original line number Diff line number Diff line -- SCHEMA_VERSION=10 [AUTOGENERATED — DO NOT EDIT MANUALLY] -- SCHEMA_VERSION=11 [AUTOGENERATED — DO NOT EDIT MANUALLY] -- Regenerate with: mise schema:generate:ddl:local CREATE TABLE IF NOT EXISTS gl_definition ( Loading config/ontology/schema.yaml +15 −0 Original line number Diff line number Diff line Loading @@ -84,6 +84,21 @@ settings: - type: reorder name: by_target order_by: [target_id, relationship_kind, source_id, traversal_path, source_kind, target_kind] # `count(target) GROUP BY source` — e.g. definitions per file via # DEFINES. Without this, the wildcard scan over vendored JS bundles # (`echarts.js` ≈ 13K defs, `pdf.worker.js` ≈ 12K defs) materializes # every edge row before counting. - type: aggregate name: agg_counts_by_source select: [relationship_kind, target_kind, source_id, traversal_path, "count()"] group_by: [relationship_kind, target_kind, source_id, traversal_path] # Mirrors `gl_edge.agg_counts` for the inverse direction # (`count(source) GROUP BY target` — e.g. files importing a given # definition via IMPORTS). - type: aggregate name: agg_counts_by_target select: [relationship_kind, source_kind, target_id, traversal_path, "count()"] group_by: [relationship_kind, source_kind, target_id, traversal_path] internal_column_prefix: "_gkg_" skip_security_filter_for_entities: [User] local_db: Loading crates/query-engine/compiler/src/lib.rs +134 −0 Original line number Diff line number Diff line Loading @@ -278,4 +278,138 @@ mod tests { "relationship_kind filter must survive, got:\n{sql}" ); } /// Org-wide `count(target) GROUP BY group` with no filters and no pinned /// IDs must emit bare `COUNT()` so ClickHouse can route to the /// `agg_counts` projection. With `COUNT(e0.source_id)`, projection /// routing fails and the query scans the full edge slice (1B+ rows for /// File → Project on production scale). #[test] fn unfiltered_edge_only_count_emits_bare_count_for_projection_routing() { let ontology = Ontology::load_embedded().expect("ontology must load"); let query = r#"{ "query_type": "aggregation", "nodes": [ {"id": "p", "entity": "Project"}, {"id": "f", "entity": "File"} ], "relationships": [{"type": "IN_PROJECT", "from": "f", "to": "p"}], "aggregations": [{ "function": "count", "target": "f", "group_by": "p", "alias": "files" }], "limit": 10 }"#; let compiled = compile(query, &ontology, &security_ctx()).expect("should compile"); let sql = compiled.base.render(); assert!( sql.contains("COUNT()") || sql.contains("count()"), "unfiltered edge-only count must emit bare COUNT() for projection \ routing, got:\n{sql}" ); assert!( !sql.contains("COUNT(e0.source_id)") && !sql.contains("count(e0.source_id)"), "must not emit COUNT(source_id) -- breaks agg_counts projection routing, \ got:\n{sql}" ); } /// When the target node has filters that fold into `countIf`, the column /// reference is required so the `-If` combinator has something to count. /// Bare `count()` would lose the filter semantics. #[test] fn filtered_edge_only_count_keeps_column_arg_for_count_if() { let ontology = Ontology::load_embedded().expect("ontology must load"); let query = r#"{ "query_type": "aggregation", "nodes": [ {"id": "p", "entity": "Project"}, {"id": "mr", "entity": "MergeRequest", "filters": { "state": {"op": "eq", "value": "opened"} }} ], "relationships": [{"type": "IN_PROJECT", "from": "mr", "to": "p"}], "aggregations": [{ "function": "count", "target": "mr", "group_by": "p", "alias": "open_mrs" }], "limit": 10 }"#; let compiled = compile(query, &ontology, &security_ctx()).expect("should compile"); let sql = compiled.base.render(); // Filter is captured via SIP CTE (`_nf_mr` filtered by state='opened', // edge `source_id IN _nf_mr`). The COUNT must reference the edge column // so that filtered edges are what's counted; emitting bare `count()` // here would not be wrong (the IN-filter still bounds rows) but would // suppress the projection-routing decision based on filter presence. assert!( sql.contains("COUNT(e0.source_id)") || sql.contains("countIf") || sql.contains("COUNTIF"), "filtered count must reference edge column or use countIf, got:\n{sql}" ); assert!( sql.contains("state = 'opened'"), "state filter must reach the SQL (in _nf_* or WHERE), got:\n{sql}" ); } /// Multi-hop traversal must constrain `target_kind`/`source_kind` on /// EVERY edge it touches, not just whichever side `node_edge_col` /// happens to map first. Without this, `User AUTHORED MR` joined to /// `MR IN_PROJECT Project` can match `User AUTHORED <other entity>` /// rows whose ID happens to collide with an MR ID, producing edges /// with garbage `target_kind` in the result and skipping kind-PK /// pruning on the second-hop edge. #[test] fn multi_hop_traversal_constrains_kind_on_every_edge() { let ontology = Ontology::load_embedded().expect("ontology must load"); let query = r#"{ "query_type": "traversal", "nodes": [ {"id": "p", "entity": "Project"}, {"id": "mr", "entity": "MergeRequest"}, {"id": "u", "entity": "User"} ], "relationships": [ {"type": "IN_PROJECT", "from": "mr", "to": "p"}, {"type": "AUTHORED", "from": "u", "to": "mr"} ], "limit": 5 }"#; let compiled = compile(query, &ontology, &security_ctx()).expect("should compile"); let sql = compiled.base.render(); // e0 covers IN_PROJECT (mr → p): both sides must be constrained. assert!( sql.contains("e0.source_kind = 'MergeRequest'"), "e0 must constrain source_kind=MergeRequest, got:\n{sql}" ); assert!( sql.contains("e0.target_kind = 'Project'"), "e0 must constrain target_kind=Project, got:\n{sql}" ); // e1 covers AUTHORED (u → mr): the previously missing target_kind // constraint that lets bogus edges leak through. assert!( sql.contains("e1.source_kind = 'User'"), "e1 must constrain source_kind=User, got:\n{sql}" ); assert!( sql.contains("e1.target_kind = 'MergeRequest'"), "e1 must constrain target_kind=MergeRequest (R2 cliff), got:\n{sql}" ); } } Loading
config/graph.sql +10 −2 Original line number Diff line number Diff line -- SCHEMA_VERSION=10 [AUTOGENERATED — DO NOT EDIT MANUALLY] -- SCHEMA_VERSION=11 [AUTOGENERATED — DO NOT EDIT MANUALLY] -- Regenerate with: mise schema:generate:ddl CREATE TABLE IF NOT EXISTS checkpoint ( Loading Loading @@ -749,7 +749,15 @@ CREATE TABLE IF NOT EXISTS gl_code_edge ( _deleted Bool DEFAULT false, INDEX idx_relationship relationship_kind TYPE set(50) GRANULARITY 2, PROJECTION by_source (SELECT * ORDER BY (source_id, relationship_kind, target_id, traversal_path, source_kind, target_kind)), PROJECTION by_target (SELECT * ORDER BY (target_id, relationship_kind, source_id, traversal_path, source_kind, target_kind)) PROJECTION by_target (SELECT * ORDER BY (target_id, relationship_kind, source_id, traversal_path, source_kind, target_kind)), PROJECTION agg_counts_by_source ( SELECT relationship_kind, target_kind, source_id, traversal_path, count() GROUP BY relationship_kind, target_kind, source_id, traversal_path ), PROJECTION agg_counts_by_target ( SELECT relationship_kind, source_kind, target_id, traversal_path, count() GROUP BY relationship_kind, source_kind, target_id, traversal_path ) ) ENGINE = ReplacingMergeTree(_version, _deleted) ORDER BY (traversal_path, source_id, relationship_kind, target_id, source_kind, target_kind) PRIMARY KEY (traversal_path, source_id, relationship_kind) Loading
config/graph_local.sql +1 −1 Original line number Diff line number Diff line -- SCHEMA_VERSION=10 [AUTOGENERATED — DO NOT EDIT MANUALLY] -- SCHEMA_VERSION=11 [AUTOGENERATED — DO NOT EDIT MANUALLY] -- Regenerate with: mise schema:generate:ddl:local CREATE TABLE IF NOT EXISTS gl_definition ( Loading
config/ontology/schema.yaml +15 −0 Original line number Diff line number Diff line Loading @@ -84,6 +84,21 @@ settings: - type: reorder name: by_target order_by: [target_id, relationship_kind, source_id, traversal_path, source_kind, target_kind] # `count(target) GROUP BY source` — e.g. definitions per file via # DEFINES. Without this, the wildcard scan over vendored JS bundles # (`echarts.js` ≈ 13K defs, `pdf.worker.js` ≈ 12K defs) materializes # every edge row before counting. - type: aggregate name: agg_counts_by_source select: [relationship_kind, target_kind, source_id, traversal_path, "count()"] group_by: [relationship_kind, target_kind, source_id, traversal_path] # Mirrors `gl_edge.agg_counts` for the inverse direction # (`count(source) GROUP BY target` — e.g. files importing a given # definition via IMPORTS). - type: aggregate name: agg_counts_by_target select: [relationship_kind, source_kind, target_id, traversal_path, "count()"] group_by: [relationship_kind, source_kind, target_id, traversal_path] internal_column_prefix: "_gkg_" skip_security_filter_for_entities: [User] local_db: Loading
crates/query-engine/compiler/src/lib.rs +134 −0 Original line number Diff line number Diff line Loading @@ -278,4 +278,138 @@ mod tests { "relationship_kind filter must survive, got:\n{sql}" ); } /// Org-wide `count(target) GROUP BY group` with no filters and no pinned /// IDs must emit bare `COUNT()` so ClickHouse can route to the /// `agg_counts` projection. With `COUNT(e0.source_id)`, projection /// routing fails and the query scans the full edge slice (1B+ rows for /// File → Project on production scale). #[test] fn unfiltered_edge_only_count_emits_bare_count_for_projection_routing() { let ontology = Ontology::load_embedded().expect("ontology must load"); let query = r#"{ "query_type": "aggregation", "nodes": [ {"id": "p", "entity": "Project"}, {"id": "f", "entity": "File"} ], "relationships": [{"type": "IN_PROJECT", "from": "f", "to": "p"}], "aggregations": [{ "function": "count", "target": "f", "group_by": "p", "alias": "files" }], "limit": 10 }"#; let compiled = compile(query, &ontology, &security_ctx()).expect("should compile"); let sql = compiled.base.render(); assert!( sql.contains("COUNT()") || sql.contains("count()"), "unfiltered edge-only count must emit bare COUNT() for projection \ routing, got:\n{sql}" ); assert!( !sql.contains("COUNT(e0.source_id)") && !sql.contains("count(e0.source_id)"), "must not emit COUNT(source_id) -- breaks agg_counts projection routing, \ got:\n{sql}" ); } /// When the target node has filters that fold into `countIf`, the column /// reference is required so the `-If` combinator has something to count. /// Bare `count()` would lose the filter semantics. #[test] fn filtered_edge_only_count_keeps_column_arg_for_count_if() { let ontology = Ontology::load_embedded().expect("ontology must load"); let query = r#"{ "query_type": "aggregation", "nodes": [ {"id": "p", "entity": "Project"}, {"id": "mr", "entity": "MergeRequest", "filters": { "state": {"op": "eq", "value": "opened"} }} ], "relationships": [{"type": "IN_PROJECT", "from": "mr", "to": "p"}], "aggregations": [{ "function": "count", "target": "mr", "group_by": "p", "alias": "open_mrs" }], "limit": 10 }"#; let compiled = compile(query, &ontology, &security_ctx()).expect("should compile"); let sql = compiled.base.render(); // Filter is captured via SIP CTE (`_nf_mr` filtered by state='opened', // edge `source_id IN _nf_mr`). The COUNT must reference the edge column // so that filtered edges are what's counted; emitting bare `count()` // here would not be wrong (the IN-filter still bounds rows) but would // suppress the projection-routing decision based on filter presence. assert!( sql.contains("COUNT(e0.source_id)") || sql.contains("countIf") || sql.contains("COUNTIF"), "filtered count must reference edge column or use countIf, got:\n{sql}" ); assert!( sql.contains("state = 'opened'"), "state filter must reach the SQL (in _nf_* or WHERE), got:\n{sql}" ); } /// Multi-hop traversal must constrain `target_kind`/`source_kind` on /// EVERY edge it touches, not just whichever side `node_edge_col` /// happens to map first. Without this, `User AUTHORED MR` joined to /// `MR IN_PROJECT Project` can match `User AUTHORED <other entity>` /// rows whose ID happens to collide with an MR ID, producing edges /// with garbage `target_kind` in the result and skipping kind-PK /// pruning on the second-hop edge. #[test] fn multi_hop_traversal_constrains_kind_on_every_edge() { let ontology = Ontology::load_embedded().expect("ontology must load"); let query = r#"{ "query_type": "traversal", "nodes": [ {"id": "p", "entity": "Project"}, {"id": "mr", "entity": "MergeRequest"}, {"id": "u", "entity": "User"} ], "relationships": [ {"type": "IN_PROJECT", "from": "mr", "to": "p"}, {"type": "AUTHORED", "from": "u", "to": "mr"} ], "limit": 5 }"#; let compiled = compile(query, &ontology, &security_ctx()).expect("should compile"); let sql = compiled.base.render(); // e0 covers IN_PROJECT (mr → p): both sides must be constrained. assert!( sql.contains("e0.source_kind = 'MergeRequest'"), "e0 must constrain source_kind=MergeRequest, got:\n{sql}" ); assert!( sql.contains("e0.target_kind = 'Project'"), "e0 must constrain target_kind=Project, got:\n{sql}" ); // e1 covers AUTHORED (u → mr): the previously missing target_kind // constraint that lets bogus edges leak through. assert!( sql.contains("e1.source_kind = 'User'"), "e1 must constrain source_kind=User, got:\n{sql}" ); assert!( sql.contains("e1.target_kind = 'MergeRequest'"), "e1 must constrain target_kind=MergeRequest (R2 cliff), got:\n{sql}" ); } }