Loading crates/integration-tests/tests/compiler/dialects/clickhouse.rs +3 −1 Original line number Diff line number Diff line Loading @@ -13,7 +13,9 @@ fn compile_to_ast_works() { }"#; let node = compile_to_ast(json, &test_ontology()).unwrap(); let Node::Query(ref q) = node; let Node::Query(ref q) = node else { unreachable!() }; assert_eq!(q.limit, Some(10)); assert!(!q.select.is_empty()); } Loading crates/query-engine/compiler/src/ast/ddl.rs +5 −0 Original line number Diff line number Diff line Loading @@ -35,15 +35,20 @@ pub struct ColumnDef { pub enum ColumnType { Int64, UInt64, UInt32, Bool, String, Date32, /// Plain `DateTime` (second precision, no timezone). DateTime, /// Timestamp with sub-second precision and optional timezone. /// Precision must be 0–9 for ClickHouse (`DateTime64`). Timestamp { precision: u8, timezone: Option<String>, }, /// ClickHouse `Enum8('label' = N, ...)`. Enum8(Vec<(std::string::String, i8)>), /// Wraps an inner type as nullable. Nullable(Box<ColumnType>), /// Dictionary-encoded / low-cardinality wrapper. Loading crates/query-engine/compiler/src/ast/dml.rs +69 −3 Original line number Diff line number Diff line Loading @@ -4,7 +4,9 @@ //! Each node maps directly to ClickHouse SQL constructs. use std::collections::HashSet; use std::sync::LazyLock; use regex::Regex; use serde_json::Value; pub use gkg_utils::clickhouse::{ChScalar, ChType}; Loading Loading @@ -92,8 +94,12 @@ pub enum Op { /// Source of rows in a FROM clause. #[derive(Debug, Clone, PartialEq)] pub enum TableRef { /// Read from a physical table → `table AS alias` Scan { table: String, alias: String }, /// Read from a physical table → `table [FINAL] AS alias` Scan { table: String, alias: String, final_: bool, }, /// Combine two sources → `left JOIN_TYPE JOIN right ON condition` Join { join_type: JoinType, Loading Loading @@ -208,6 +214,7 @@ impl Default for Query { from: TableRef::Scan { table: String::new(), alias: String::new(), final_: false, }, where_clause: None, group_by: vec![], Loading @@ -220,10 +227,60 @@ impl Default for Query { } } /// Top-level AST node - either a simple query or a recursive CTE. /// SQL identifier pattern: ASCII letter or underscore, then alphanumerics/underscores. static SAFE_IDENT: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^[a-zA-Z_][a-zA-Z0-9_]*$").expect("valid regex")); /// `INSERT INTO table (cols) VALUES (row1), (row2), ...` /// /// Table and column names are interpolated as raw identifiers (not parameterized), /// so they are validated at construction time via [`Insert::new`]. Fields are /// private to enforce this — use the constructor. #[derive(Debug, Clone, PartialEq)] pub struct Insert { table: String, columns: Vec<String>, values: Vec<Vec<Expr>>, } impl Insert { pub fn new(table: impl Into<String>, columns: Vec<String>, values: Vec<Vec<Expr>>) -> Self { let table = table.into(); debug_assert!( SAFE_IDENT.is_match(&table), "INSERT table name is not a safe identifier: {table:?}" ); for col in &columns { debug_assert!( SAFE_IDENT.is_match(col), "INSERT column name is not a safe identifier: {col:?}" ); } Self { table, columns, values, } } pub fn table(&self) -> &str { &self.table } pub fn columns(&self) -> &[String] { &self.columns } pub fn values(&self) -> &[Vec<Expr>] { &self.values } } /// Top-level AST node. #[derive(Debug, Clone, PartialEq)] pub enum Node { Query(Box<Query>), Insert(Box<Insert>), } // ───────────────────────────────────────────────────────────────────────────── Loading Loading @@ -444,6 +501,15 @@ impl TableRef { TableRef::Scan { table: table.into(), alias: alias.into(), final_: false, } } pub fn scan_final(table: impl Into<String>, alias: impl Into<String>) -> Self { TableRef::Scan { table: table.into(), alias: alias.into(), final_: true, } } Loading crates/query-engine/compiler/src/passes/check.rs +1 −0 Original line number Diff line number Diff line Loading @@ -24,6 +24,7 @@ pub fn check_ast(node: &Node, ctx: &SecurityContext) -> Result<()> { } check_query(q, ctx) } Node::Insert(_) => Ok(()), } } Loading crates/query-engine/compiler/src/passes/codegen/clickhouse.rs +150 −9 Original line number Diff line number Diff line Loading @@ -5,7 +5,7 @@ use gkg_server_config::QueryConfig; use crate::ast::{ChType, Cte, Expr, JoinType, Node, Op, Query, TableRef}; use crate::ast::{ChType, Cte, Expr, Insert, JoinType, Node, Op, Query, TableRef}; use crate::error::Result; use crate::passes::enforce::ResultContext; use serde_json::Value; Loading @@ -21,11 +21,13 @@ pub fn codegen( let mut ctx = Context::new(); let mut sql = match ast { Node::Query(q) => ctx.emit_query(q)?, Node::Insert(ins) => ctx.emit_insert(ins), }; // SETTINGS — only on the top-level query, not subqueries/UNION arms. // SETTINGS — only on SELECT queries, not INSERT or subqueries/UNION arms. // Values are pre-formatted as SQL-safe literals by to_clickhouse_settings() // (bare integers, 0/1 bools, escaped quoted strings). if matches!(ast, Node::Query(_)) { let settings = query_config .to_clickhouse_settings() .map_err(crate::error::QueryError::Codegen)?; Loading @@ -33,6 +35,7 @@ pub fn codegen( let clause: Vec<String> = settings.iter().map(|(k, v)| format!("{k} = {v}")).collect(); sql.push_str(&format!(" SETTINGS {}", clause.join(", "))); } } Ok(ParameterizedQuery { sql, Loading @@ -43,6 +46,24 @@ pub fn codegen( }) } /// Emit a `Query` (or `Insert`) AST as parameterized ClickHouse SQL without /// requiring `ResultContext` or `QueryConfig`. /// /// # Trust boundary /// /// This function bypasses the compiler security pipeline (`apply_security_context`, /// `check_ast`, `enforce_return`). It must only be used for trusted, internally /// constructed ASTs (e.g. schema version management DDL/DML), never for /// user-supplied query input. pub fn emit_simple_query(node: &Node) -> Result<(String, HashMap<String, ParamValue>)> { let mut ctx = Context::new(); let sql = match node { Node::Query(q) => ctx.emit_query(q)?, Node::Insert(ins) => ctx.emit_insert(ins), }; Ok((sql, ctx.params)) } struct Context { params: HashMap<String, ParamValue>, } Loading @@ -54,6 +75,24 @@ impl Context { } } fn emit_insert(&mut self, ins: &Insert) -> String { let cols = ins.columns().join(", "); let rows: Vec<String> = ins .values() .iter() .map(|row| { let exprs: Vec<String> = row.iter().map(|e| self.emit_expr(e)).collect(); format!("({})", exprs.join(", ")) }) .collect(); format!( "INSERT INTO {} ({}) VALUES {}", ins.table(), cols, rows.join(", ") ) } fn emit_query(&mut self, q: &Query) -> Result<String> { let mut parts = Vec::new(); Loading Loading @@ -258,7 +297,17 @@ impl Context { fn emit_table_ref(&mut self, t: &TableRef) -> Result<String> { match t { TableRef::Scan { table, alias } => Ok(format!("{table} AS {alias}")), TableRef::Scan { table, alias, final_, } => { if *final_ { Ok(format!("{table} FINAL AS {alias}")) } else { Ok(format!("{table} AS {alias}")) } } TableRef::Join { join_type, left, Loading Loading @@ -800,6 +849,98 @@ mod tests { assert!(result.sql.contains(") AS all_edges")); } #[test] fn insert_values() { let ins = Insert::new( "gl_schema_versions", vec!["key".into(), "version".into()], vec![ vec![Expr::string("graph"), Expr::int(3)], vec![Expr::string("datalake"), Expr::int(1)], ], ); let result = codegen( &Node::Insert(Box::new(ins)), empty_ctx(), QueryConfig::default(), ) .unwrap(); assert_eq!( result.sql, "INSERT INTO gl_schema_versions (key, version) VALUES ({p0:String}, {p1:Int64}), ({p2:String}, {p3:Int64})" ); assert_eq!( result.params.get("p0").map(|p| &p.value), Some(&Value::String("graph".into())) ); assert_eq!( result.params.get("p1").map(|p| &p.value), Some(&Value::Number(3.into())) ); } #[test] fn insert_skips_settings() { let ins = Insert::new("t", vec!["a".into()], vec![vec![Expr::int(1)]]); let cfg = QueryConfig { max_execution_time: None, use_query_cache: Some(true), query_cache_ttl: Some(60), }; let result = codegen(&Node::Insert(Box::new(ins)), empty_ctx(), cfg).unwrap(); assert!( !result.sql.contains("SETTINGS"), "INSERT should not have SETTINGS: {}", result.sql, ); } #[test] fn scan_final() { let q = Query { select: vec![SelectExpr::new(Expr::col("t", "id"), "id")], from: TableRef::scan_final("gl_schema_versions", "t"), ..Default::default() }; let result = codegen( &Node::Query(Box::new(q)), empty_ctx(), QueryConfig::default(), ) .unwrap(); assert_eq!( result.sql, "SELECT t.id AS id FROM gl_schema_versions FINAL AS t" ); } #[test] fn emit_simple_query_with_final_and_params() { let q = Query { select: vec![ SelectExpr::new(Expr::col("t", "key"), "key"), SelectExpr::new(Expr::col("t", "version"), "version"), ], from: TableRef::scan_final("gl_schema_versions", "t"), where_clause: Some(Expr::eq(Expr::col("t", "key"), Expr::string("graph"))), ..Default::default() }; let (sql, params) = emit_simple_query(&Node::Query(Box::new(q))).unwrap(); assert_eq!( sql, "SELECT t.key AS key, t.version AS version FROM gl_schema_versions FINAL AS t WHERE (t.key = {p0:String})" ); assert_eq!(params.len(), 1); assert_eq!( params.get("p0").map(|p| &p.value), Some(&Value::String("graph".into())) ); } #[test] fn render_replaces_scalar_params() { let mut params = HashMap::new(); Loading Loading
crates/integration-tests/tests/compiler/dialects/clickhouse.rs +3 −1 Original line number Diff line number Diff line Loading @@ -13,7 +13,9 @@ fn compile_to_ast_works() { }"#; let node = compile_to_ast(json, &test_ontology()).unwrap(); let Node::Query(ref q) = node; let Node::Query(ref q) = node else { unreachable!() }; assert_eq!(q.limit, Some(10)); assert!(!q.select.is_empty()); } Loading
crates/query-engine/compiler/src/ast/ddl.rs +5 −0 Original line number Diff line number Diff line Loading @@ -35,15 +35,20 @@ pub struct ColumnDef { pub enum ColumnType { Int64, UInt64, UInt32, Bool, String, Date32, /// Plain `DateTime` (second precision, no timezone). DateTime, /// Timestamp with sub-second precision and optional timezone. /// Precision must be 0–9 for ClickHouse (`DateTime64`). Timestamp { precision: u8, timezone: Option<String>, }, /// ClickHouse `Enum8('label' = N, ...)`. Enum8(Vec<(std::string::String, i8)>), /// Wraps an inner type as nullable. Nullable(Box<ColumnType>), /// Dictionary-encoded / low-cardinality wrapper. Loading
crates/query-engine/compiler/src/ast/dml.rs +69 −3 Original line number Diff line number Diff line Loading @@ -4,7 +4,9 @@ //! Each node maps directly to ClickHouse SQL constructs. use std::collections::HashSet; use std::sync::LazyLock; use regex::Regex; use serde_json::Value; pub use gkg_utils::clickhouse::{ChScalar, ChType}; Loading Loading @@ -92,8 +94,12 @@ pub enum Op { /// Source of rows in a FROM clause. #[derive(Debug, Clone, PartialEq)] pub enum TableRef { /// Read from a physical table → `table AS alias` Scan { table: String, alias: String }, /// Read from a physical table → `table [FINAL] AS alias` Scan { table: String, alias: String, final_: bool, }, /// Combine two sources → `left JOIN_TYPE JOIN right ON condition` Join { join_type: JoinType, Loading Loading @@ -208,6 +214,7 @@ impl Default for Query { from: TableRef::Scan { table: String::new(), alias: String::new(), final_: false, }, where_clause: None, group_by: vec![], Loading @@ -220,10 +227,60 @@ impl Default for Query { } } /// Top-level AST node - either a simple query or a recursive CTE. /// SQL identifier pattern: ASCII letter or underscore, then alphanumerics/underscores. static SAFE_IDENT: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^[a-zA-Z_][a-zA-Z0-9_]*$").expect("valid regex")); /// `INSERT INTO table (cols) VALUES (row1), (row2), ...` /// /// Table and column names are interpolated as raw identifiers (not parameterized), /// so they are validated at construction time via [`Insert::new`]. Fields are /// private to enforce this — use the constructor. #[derive(Debug, Clone, PartialEq)] pub struct Insert { table: String, columns: Vec<String>, values: Vec<Vec<Expr>>, } impl Insert { pub fn new(table: impl Into<String>, columns: Vec<String>, values: Vec<Vec<Expr>>) -> Self { let table = table.into(); debug_assert!( SAFE_IDENT.is_match(&table), "INSERT table name is not a safe identifier: {table:?}" ); for col in &columns { debug_assert!( SAFE_IDENT.is_match(col), "INSERT column name is not a safe identifier: {col:?}" ); } Self { table, columns, values, } } pub fn table(&self) -> &str { &self.table } pub fn columns(&self) -> &[String] { &self.columns } pub fn values(&self) -> &[Vec<Expr>] { &self.values } } /// Top-level AST node. #[derive(Debug, Clone, PartialEq)] pub enum Node { Query(Box<Query>), Insert(Box<Insert>), } // ───────────────────────────────────────────────────────────────────────────── Loading Loading @@ -444,6 +501,15 @@ impl TableRef { TableRef::Scan { table: table.into(), alias: alias.into(), final_: false, } } pub fn scan_final(table: impl Into<String>, alias: impl Into<String>) -> Self { TableRef::Scan { table: table.into(), alias: alias.into(), final_: true, } } Loading
crates/query-engine/compiler/src/passes/check.rs +1 −0 Original line number Diff line number Diff line Loading @@ -24,6 +24,7 @@ pub fn check_ast(node: &Node, ctx: &SecurityContext) -> Result<()> { } check_query(q, ctx) } Node::Insert(_) => Ok(()), } } Loading
crates/query-engine/compiler/src/passes/codegen/clickhouse.rs +150 −9 Original line number Diff line number Diff line Loading @@ -5,7 +5,7 @@ use gkg_server_config::QueryConfig; use crate::ast::{ChType, Cte, Expr, JoinType, Node, Op, Query, TableRef}; use crate::ast::{ChType, Cte, Expr, Insert, JoinType, Node, Op, Query, TableRef}; use crate::error::Result; use crate::passes::enforce::ResultContext; use serde_json::Value; Loading @@ -21,11 +21,13 @@ pub fn codegen( let mut ctx = Context::new(); let mut sql = match ast { Node::Query(q) => ctx.emit_query(q)?, Node::Insert(ins) => ctx.emit_insert(ins), }; // SETTINGS — only on the top-level query, not subqueries/UNION arms. // SETTINGS — only on SELECT queries, not INSERT or subqueries/UNION arms. // Values are pre-formatted as SQL-safe literals by to_clickhouse_settings() // (bare integers, 0/1 bools, escaped quoted strings). if matches!(ast, Node::Query(_)) { let settings = query_config .to_clickhouse_settings() .map_err(crate::error::QueryError::Codegen)?; Loading @@ -33,6 +35,7 @@ pub fn codegen( let clause: Vec<String> = settings.iter().map(|(k, v)| format!("{k} = {v}")).collect(); sql.push_str(&format!(" SETTINGS {}", clause.join(", "))); } } Ok(ParameterizedQuery { sql, Loading @@ -43,6 +46,24 @@ pub fn codegen( }) } /// Emit a `Query` (or `Insert`) AST as parameterized ClickHouse SQL without /// requiring `ResultContext` or `QueryConfig`. /// /// # Trust boundary /// /// This function bypasses the compiler security pipeline (`apply_security_context`, /// `check_ast`, `enforce_return`). It must only be used for trusted, internally /// constructed ASTs (e.g. schema version management DDL/DML), never for /// user-supplied query input. pub fn emit_simple_query(node: &Node) -> Result<(String, HashMap<String, ParamValue>)> { let mut ctx = Context::new(); let sql = match node { Node::Query(q) => ctx.emit_query(q)?, Node::Insert(ins) => ctx.emit_insert(ins), }; Ok((sql, ctx.params)) } struct Context { params: HashMap<String, ParamValue>, } Loading @@ -54,6 +75,24 @@ impl Context { } } fn emit_insert(&mut self, ins: &Insert) -> String { let cols = ins.columns().join(", "); let rows: Vec<String> = ins .values() .iter() .map(|row| { let exprs: Vec<String> = row.iter().map(|e| self.emit_expr(e)).collect(); format!("({})", exprs.join(", ")) }) .collect(); format!( "INSERT INTO {} ({}) VALUES {}", ins.table(), cols, rows.join(", ") ) } fn emit_query(&mut self, q: &Query) -> Result<String> { let mut parts = Vec::new(); Loading Loading @@ -258,7 +297,17 @@ impl Context { fn emit_table_ref(&mut self, t: &TableRef) -> Result<String> { match t { TableRef::Scan { table, alias } => Ok(format!("{table} AS {alias}")), TableRef::Scan { table, alias, final_, } => { if *final_ { Ok(format!("{table} FINAL AS {alias}")) } else { Ok(format!("{table} AS {alias}")) } } TableRef::Join { join_type, left, Loading Loading @@ -800,6 +849,98 @@ mod tests { assert!(result.sql.contains(") AS all_edges")); } #[test] fn insert_values() { let ins = Insert::new( "gl_schema_versions", vec!["key".into(), "version".into()], vec![ vec![Expr::string("graph"), Expr::int(3)], vec![Expr::string("datalake"), Expr::int(1)], ], ); let result = codegen( &Node::Insert(Box::new(ins)), empty_ctx(), QueryConfig::default(), ) .unwrap(); assert_eq!( result.sql, "INSERT INTO gl_schema_versions (key, version) VALUES ({p0:String}, {p1:Int64}), ({p2:String}, {p3:Int64})" ); assert_eq!( result.params.get("p0").map(|p| &p.value), Some(&Value::String("graph".into())) ); assert_eq!( result.params.get("p1").map(|p| &p.value), Some(&Value::Number(3.into())) ); } #[test] fn insert_skips_settings() { let ins = Insert::new("t", vec!["a".into()], vec![vec![Expr::int(1)]]); let cfg = QueryConfig { max_execution_time: None, use_query_cache: Some(true), query_cache_ttl: Some(60), }; let result = codegen(&Node::Insert(Box::new(ins)), empty_ctx(), cfg).unwrap(); assert!( !result.sql.contains("SETTINGS"), "INSERT should not have SETTINGS: {}", result.sql, ); } #[test] fn scan_final() { let q = Query { select: vec![SelectExpr::new(Expr::col("t", "id"), "id")], from: TableRef::scan_final("gl_schema_versions", "t"), ..Default::default() }; let result = codegen( &Node::Query(Box::new(q)), empty_ctx(), QueryConfig::default(), ) .unwrap(); assert_eq!( result.sql, "SELECT t.id AS id FROM gl_schema_versions FINAL AS t" ); } #[test] fn emit_simple_query_with_final_and_params() { let q = Query { select: vec![ SelectExpr::new(Expr::col("t", "key"), "key"), SelectExpr::new(Expr::col("t", "version"), "version"), ], from: TableRef::scan_final("gl_schema_versions", "t"), where_clause: Some(Expr::eq(Expr::col("t", "key"), Expr::string("graph"))), ..Default::default() }; let (sql, params) = emit_simple_query(&Node::Query(Box::new(q))).unwrap(); assert_eq!( sql, "SELECT t.key AS key, t.version AS version FROM gl_schema_versions FINAL AS t WHERE (t.key = {p0:String})" ); assert_eq!(params.len(), 1); assert_eq!( params.get("p0").map(|p| &p.value), Some(&Value::String("graph".into())) ); } #[test] fn render_replaces_scalar_params() { let mut params = HashMap::new(); Loading