perf(compiler): close 4 query cliffs (6fdc8963) · Commits · GitLab.org / orbit / GitLab Knowledge Graph

config/SCHEMA_VERSION

+1 −1

Original line number	Diff line number	Diff line
		10
		11

config/graph.sql

+10 −2

Original line number	Diff line number	Diff line
		-- SCHEMA_VERSION=10 [AUTOGENERATED — DO NOT EDIT MANUALLY]
		-- SCHEMA_VERSION=11 [AUTOGENERATED — DO NOT EDIT MANUALLY]
		-- Regenerate with: mise schema:generate:ddl

		CREATE TABLE IF NOT EXISTS checkpoint (
		@@ -749,7 +749,15 @@ CREATE TABLE IF NOT EXISTS gl_code_edge (
		_deleted Bool DEFAULT false,
		INDEX idx_relationship relationship_kind TYPE set(50) GRANULARITY 2,
		PROJECTION by_source (SELECT * ORDER BY (source_id, relationship_kind, target_id, traversal_path, source_kind, target_kind)),
		PROJECTION by_target (SELECT * ORDER BY (target_id, relationship_kind, source_id, traversal_path, source_kind, target_kind))
		PROJECTION by_target (SELECT * ORDER BY (target_id, relationship_kind, source_id, traversal_path, source_kind, target_kind)),
		PROJECTION agg_counts_by_source (
		SELECT relationship_kind, target_kind, source_id, traversal_path, count()
		GROUP BY relationship_kind, target_kind, source_id, traversal_path
		),
		PROJECTION agg_counts_by_target (
		SELECT relationship_kind, source_kind, target_id, traversal_path, count()
		GROUP BY relationship_kind, source_kind, target_id, traversal_path
		)
		) ENGINE = ReplacingMergeTree(_version, _deleted)
		ORDER BY (traversal_path, source_id, relationship_kind, target_id, source_kind, target_kind)
		PRIMARY KEY (traversal_path, source_id, relationship_kind)

config/graph_local.sql

+1 −1

Original line number	Diff line number	Diff line
		-- SCHEMA_VERSION=10 [AUTOGENERATED — DO NOT EDIT MANUALLY]
		-- SCHEMA_VERSION=11 [AUTOGENERATED — DO NOT EDIT MANUALLY]
		-- Regenerate with: mise schema:generate:ddl:local

		CREATE TABLE IF NOT EXISTS gl_definition (

config/ontology/schema.yaml

+15 −0

Original line number	Diff line number	Diff line
		@@ -84,6 +84,21 @@ settings:
		- type: reorder
		name: by_target
		order_by: [target_id, relationship_kind, source_id, traversal_path, source_kind, target_kind]
		# `count(target) GROUP BY source` — e.g. definitions per file via
		# DEFINES. Without this, the wildcard scan over vendored JS bundles
		# (`echarts.js` ≈ 13K defs, `pdf.worker.js` ≈ 12K defs) materializes
		# every edge row before counting.
		- type: aggregate
		name: agg_counts_by_source
		select: [relationship_kind, target_kind, source_id, traversal_path, "count()"]
		group_by: [relationship_kind, target_kind, source_id, traversal_path]
		# Mirrors `gl_edge.agg_counts` for the inverse direction
		# (`count(source) GROUP BY target` — e.g. files importing a given
		# definition via IMPORTS).
		- type: aggregate
		name: agg_counts_by_target
		select: [relationship_kind, source_kind, target_id, traversal_path, "count()"]
		group_by: [relationship_kind, source_kind, target_id, traversal_path]
		internal_column_prefix: "_gkg_"
		skip_security_filter_for_entities: [User]
		local_db:

crates/query-engine/compiler/src/lib.rs

+134 −0

Original line number	Diff line number	Diff line
		@@ -278,4 +278,138 @@ mod tests {
		"relationship_kind filter must survive, got:\n{sql}"
		);
		}

		/// Org-wide `count(target) GROUP BY group` with no filters and no pinned
		/// IDs must emit bare `COUNT()` so ClickHouse can route to the
		/// `agg_counts` projection. With `COUNT(e0.source_id)`, projection
		/// routing fails and the query scans the full edge slice (1B+ rows for
		/// File → Project on production scale).
		#[test]
		fn unfiltered_edge_only_count_emits_bare_count_for_projection_routing() {
		let ontology = Ontology::load_embedded().expect("ontology must load");

		let query = r#"{
		"query_type": "aggregation",
		"nodes": [
		{"id": "p", "entity": "Project"},
		{"id": "f", "entity": "File"}
		],
		"relationships": [{"type": "IN_PROJECT", "from": "f", "to": "p"}],
		"aggregations": [{
		"function": "count",
		"target": "f",
		"group_by": "p",
		"alias": "files"
		}],
		"limit": 10
		}"#;

		let compiled = compile(query, &ontology, &security_ctx()).expect("should compile");
		let sql = compiled.base.render();

		assert!(
		sql.contains("COUNT()") \|\| sql.contains("count()"),
		"unfiltered edge-only count must emit bare COUNT() for projection \
		routing, got:\n{sql}"
		);
		assert!(
		!sql.contains("COUNT(e0.source_id)") && !sql.contains("count(e0.source_id)"),
		"must not emit COUNT(source_id) -- breaks agg_counts projection routing, \
		got:\n{sql}"
		);
		}

		/// When the target node has filters that fold into `countIf`, the column
		/// reference is required so the `-If` combinator has something to count.
		/// Bare `count()` would lose the filter semantics.
		#[test]
		fn filtered_edge_only_count_keeps_column_arg_for_count_if() {
		let ontology = Ontology::load_embedded().expect("ontology must load");

		let query = r#"{
		"query_type": "aggregation",
		"nodes": [
		{"id": "p", "entity": "Project"},
		{"id": "mr", "entity": "MergeRequest", "filters": {
		"state": {"op": "eq", "value": "opened"}
		}}
		],
		"relationships": [{"type": "IN_PROJECT", "from": "mr", "to": "p"}],
		"aggregations": [{
		"function": "count",
		"target": "mr",
		"group_by": "p",
		"alias": "open_mrs"
		}],
		"limit": 10
		}"#;

		let compiled = compile(query, &ontology, &security_ctx()).expect("should compile");
		let sql = compiled.base.render();

		// Filter is captured via SIP CTE (`_nf_mr` filtered by state='opened',
		// edge `source_id IN _nf_mr`). The COUNT must reference the edge column
		// so that filtered edges are what's counted; emitting bare `count()`
		// here would not be wrong (the IN-filter still bounds rows) but would
		// suppress the projection-routing decision based on filter presence.
		assert!(
		sql.contains("COUNT(e0.source_id)")
		\|\| sql.contains("countIf")
		\|\| sql.contains("COUNTIF"),
		"filtered count must reference edge column or use countIf, got:\n{sql}"
		);
		assert!(
		sql.contains("state = 'opened'"),
		"state filter must reach the SQL (in _nf_* or WHERE), got:\n{sql}"
		);
		}

		/// Multi-hop traversal must constrain `target_kind`/`source_kind` on
		/// EVERY edge it touches, not just whichever side `node_edge_col`
		/// happens to map first. Without this, `User AUTHORED MR` joined to
		/// `MR IN_PROJECT Project` can match `User AUTHORED <other entity>`
		/// rows whose ID happens to collide with an MR ID, producing edges
		/// with garbage `target_kind` in the result and skipping kind-PK
		/// pruning on the second-hop edge.
		#[test]
		fn multi_hop_traversal_constrains_kind_on_every_edge() {
		let ontology = Ontology::load_embedded().expect("ontology must load");

		let query = r#"{
		"query_type": "traversal",
		"nodes": [
		{"id": "p", "entity": "Project"},
		{"id": "mr", "entity": "MergeRequest"},
		{"id": "u", "entity": "User"}
		],
		"relationships": [
		{"type": "IN_PROJECT", "from": "mr", "to": "p"},
		{"type": "AUTHORED", "from": "u", "to": "mr"}
		],
		"limit": 5
		}"#;

		let compiled = compile(query, &ontology, &security_ctx()).expect("should compile");
		let sql = compiled.base.render();

		// e0 covers IN_PROJECT (mr → p): both sides must be constrained.
		assert!(
		sql.contains("e0.source_kind = 'MergeRequest'"),
		"e0 must constrain source_kind=MergeRequest, got:\n{sql}"
		);
		assert!(
		sql.contains("e0.target_kind = 'Project'"),
		"e0 must constrain target_kind=Project, got:\n{sql}"
		);
		// e1 covers AUTHORED (u → mr): the previously missing target_kind
		// constraint that lets bogus edges leak through.
		assert!(
		sql.contains("e1.source_kind = 'User'"),
		"e1 must constrain source_kind=User, got:\n{sql}"
		);
		assert!(
		sql.contains("e1.target_kind = 'MergeRequest'"),
		"e1 must constrain target_kind=MergeRequest (R2 cliff), got:\n{sql}"
		);
		}
		}