Verified Commit 3211e618 authored by Michael Usachenko's avatar Michael Usachenko Committed by GitLab
Browse files

feat(compiler): extend AST and codegen for schema version statements

parent 35e8eda9
Loading
Loading
Loading
Loading
+3 −1
Original line number Diff line number Diff line
@@ -13,7 +13,9 @@ fn compile_to_ast_works() {
    }"#;

    let node = compile_to_ast(json, &test_ontology()).unwrap();
    let Node::Query(ref q) = node;
    let Node::Query(ref q) = node else {
        unreachable!()
    };
    assert_eq!(q.limit, Some(10));
    assert!(!q.select.is_empty());
}
+5 −0
Original line number Diff line number Diff line
@@ -35,15 +35,20 @@ pub struct ColumnDef {
pub enum ColumnType {
    Int64,
    UInt64,
    UInt32,
    Bool,
    String,
    Date32,
    /// Plain `DateTime` (second precision, no timezone).
    DateTime,
    /// Timestamp with sub-second precision and optional timezone.
    /// Precision must be 0–9 for ClickHouse (`DateTime64`).
    Timestamp {
        precision: u8,
        timezone: Option<String>,
    },
    /// ClickHouse `Enum8('label' = N, ...)`.
    Enum8(Vec<(std::string::String, i8)>),
    /// Wraps an inner type as nullable.
    Nullable(Box<ColumnType>),
    /// Dictionary-encoded / low-cardinality wrapper.
+69 −3
Original line number Diff line number Diff line
@@ -4,7 +4,9 @@
//! Each node maps directly to ClickHouse SQL constructs.

use std::collections::HashSet;
use std::sync::LazyLock;

use regex::Regex;
use serde_json::Value;

pub use gkg_utils::clickhouse::{ChScalar, ChType};
@@ -92,8 +94,12 @@ pub enum Op {
/// Source of rows in a FROM clause.
#[derive(Debug, Clone, PartialEq)]
pub enum TableRef {
    /// Read from a physical table → `table AS alias`
    Scan { table: String, alias: String },
    /// Read from a physical table → `table [FINAL] AS alias`
    Scan {
        table: String,
        alias: String,
        final_: bool,
    },
    /// Combine two sources → `left JOIN_TYPE JOIN right ON condition`
    Join {
        join_type: JoinType,
@@ -208,6 +214,7 @@ impl Default for Query {
            from: TableRef::Scan {
                table: String::new(),
                alias: String::new(),
                final_: false,
            },
            where_clause: None,
            group_by: vec![],
@@ -220,10 +227,60 @@ impl Default for Query {
    }
}

/// Top-level AST node - either a simple query or a recursive CTE.
/// SQL identifier pattern: ASCII letter or underscore, then alphanumerics/underscores.
static SAFE_IDENT: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^[a-zA-Z_][a-zA-Z0-9_]*$").expect("valid regex"));

/// `INSERT INTO table (cols) VALUES (row1), (row2), ...`
///
/// Table and column names are interpolated as raw identifiers (not parameterized),
/// so they are validated at construction time via [`Insert::new`]. Fields are
/// private to enforce this — use the constructor.
#[derive(Debug, Clone, PartialEq)]
pub struct Insert {
    table: String,
    columns: Vec<String>,
    values: Vec<Vec<Expr>>,
}

impl Insert {
    pub fn new(table: impl Into<String>, columns: Vec<String>, values: Vec<Vec<Expr>>) -> Self {
        let table = table.into();
        debug_assert!(
            SAFE_IDENT.is_match(&table),
            "INSERT table name is not a safe identifier: {table:?}"
        );
        for col in &columns {
            debug_assert!(
                SAFE_IDENT.is_match(col),
                "INSERT column name is not a safe identifier: {col:?}"
            );
        }
        Self {
            table,
            columns,
            values,
        }
    }

    pub fn table(&self) -> &str {
        &self.table
    }

    pub fn columns(&self) -> &[String] {
        &self.columns
    }

    pub fn values(&self) -> &[Vec<Expr>] {
        &self.values
    }
}

/// Top-level AST node.
#[derive(Debug, Clone, PartialEq)]
pub enum Node {
    Query(Box<Query>),
    Insert(Box<Insert>),
}

// ─────────────────────────────────────────────────────────────────────────────
@@ -444,6 +501,15 @@ impl TableRef {
        TableRef::Scan {
            table: table.into(),
            alias: alias.into(),
            final_: false,
        }
    }

    pub fn scan_final(table: impl Into<String>, alias: impl Into<String>) -> Self {
        TableRef::Scan {
            table: table.into(),
            alias: alias.into(),
            final_: true,
        }
    }

+1 −0
Original line number Diff line number Diff line
@@ -24,6 +24,7 @@ pub fn check_ast(node: &Node, ctx: &SecurityContext) -> Result<()> {
            }
            check_query(q, ctx)
        }
        Node::Insert(_) => Ok(()),
    }
}

+150 −9
Original line number Diff line number Diff line
@@ -5,7 +5,7 @@

use gkg_server_config::QueryConfig;

use crate::ast::{ChType, Cte, Expr, JoinType, Node, Op, Query, TableRef};
use crate::ast::{ChType, Cte, Expr, Insert, JoinType, Node, Op, Query, TableRef};
use crate::error::Result;
use crate::passes::enforce::ResultContext;
use serde_json::Value;
@@ -21,11 +21,13 @@ pub fn codegen(
    let mut ctx = Context::new();
    let mut sql = match ast {
        Node::Query(q) => ctx.emit_query(q)?,
        Node::Insert(ins) => ctx.emit_insert(ins),
    };

    // SETTINGS — only on the top-level query, not subqueries/UNION arms.
    // SETTINGS — only on SELECT queries, not INSERT or subqueries/UNION arms.
    // Values are pre-formatted as SQL-safe literals by to_clickhouse_settings()
    // (bare integers, 0/1 bools, escaped quoted strings).
    if matches!(ast, Node::Query(_)) {
        let settings = query_config
            .to_clickhouse_settings()
            .map_err(crate::error::QueryError::Codegen)?;
@@ -33,6 +35,7 @@ pub fn codegen(
            let clause: Vec<String> = settings.iter().map(|(k, v)| format!("{k} = {v}")).collect();
            sql.push_str(&format!(" SETTINGS {}", clause.join(", ")));
        }
    }

    Ok(ParameterizedQuery {
        sql,
@@ -43,6 +46,24 @@ pub fn codegen(
    })
}

/// Emit a `Query` (or `Insert`) AST as parameterized ClickHouse SQL without
/// requiring `ResultContext` or `QueryConfig`.
///
/// # Trust boundary
///
/// This function bypasses the compiler security pipeline (`apply_security_context`,
/// `check_ast`, `enforce_return`). It must only be used for trusted, internally
/// constructed ASTs (e.g. schema version management DDL/DML), never for
/// user-supplied query input.
pub fn emit_simple_query(node: &Node) -> Result<(String, HashMap<String, ParamValue>)> {
    let mut ctx = Context::new();
    let sql = match node {
        Node::Query(q) => ctx.emit_query(q)?,
        Node::Insert(ins) => ctx.emit_insert(ins),
    };
    Ok((sql, ctx.params))
}

struct Context {
    params: HashMap<String, ParamValue>,
}
@@ -54,6 +75,24 @@ impl Context {
        }
    }

    fn emit_insert(&mut self, ins: &Insert) -> String {
        let cols = ins.columns().join(", ");
        let rows: Vec<String> = ins
            .values()
            .iter()
            .map(|row| {
                let exprs: Vec<String> = row.iter().map(|e| self.emit_expr(e)).collect();
                format!("({})", exprs.join(", "))
            })
            .collect();
        format!(
            "INSERT INTO {} ({}) VALUES {}",
            ins.table(),
            cols,
            rows.join(", ")
        )
    }

    fn emit_query(&mut self, q: &Query) -> Result<String> {
        let mut parts = Vec::new();

@@ -258,7 +297,17 @@ impl Context {

    fn emit_table_ref(&mut self, t: &TableRef) -> Result<String> {
        match t {
            TableRef::Scan { table, alias } => Ok(format!("{table} AS {alias}")),
            TableRef::Scan {
                table,
                alias,
                final_,
            } => {
                if *final_ {
                    Ok(format!("{table} FINAL AS {alias}"))
                } else {
                    Ok(format!("{table} AS {alias}"))
                }
            }
            TableRef::Join {
                join_type,
                left,
@@ -800,6 +849,98 @@ mod tests {
        assert!(result.sql.contains(") AS all_edges"));
    }

    #[test]
    fn insert_values() {
        let ins = Insert::new(
            "gl_schema_versions",
            vec!["key".into(), "version".into()],
            vec![
                vec![Expr::string("graph"), Expr::int(3)],
                vec![Expr::string("datalake"), Expr::int(1)],
            ],
        );

        let result = codegen(
            &Node::Insert(Box::new(ins)),
            empty_ctx(),
            QueryConfig::default(),
        )
        .unwrap();
        assert_eq!(
            result.sql,
            "INSERT INTO gl_schema_versions (key, version) VALUES ({p0:String}, {p1:Int64}), ({p2:String}, {p3:Int64})"
        );
        assert_eq!(
            result.params.get("p0").map(|p| &p.value),
            Some(&Value::String("graph".into()))
        );
        assert_eq!(
            result.params.get("p1").map(|p| &p.value),
            Some(&Value::Number(3.into()))
        );
    }

    #[test]
    fn insert_skips_settings() {
        let ins = Insert::new("t", vec!["a".into()], vec![vec![Expr::int(1)]]);

        let cfg = QueryConfig {
            max_execution_time: None,
            use_query_cache: Some(true),
            query_cache_ttl: Some(60),
        };
        let result = codegen(&Node::Insert(Box::new(ins)), empty_ctx(), cfg).unwrap();
        assert!(
            !result.sql.contains("SETTINGS"),
            "INSERT should not have SETTINGS: {}",
            result.sql,
        );
    }

    #[test]
    fn scan_final() {
        let q = Query {
            select: vec![SelectExpr::new(Expr::col("t", "id"), "id")],
            from: TableRef::scan_final("gl_schema_versions", "t"),
            ..Default::default()
        };

        let result = codegen(
            &Node::Query(Box::new(q)),
            empty_ctx(),
            QueryConfig::default(),
        )
        .unwrap();
        assert_eq!(
            result.sql,
            "SELECT t.id AS id FROM gl_schema_versions FINAL AS t"
        );
    }

    #[test]
    fn emit_simple_query_with_final_and_params() {
        let q = Query {
            select: vec![
                SelectExpr::new(Expr::col("t", "key"), "key"),
                SelectExpr::new(Expr::col("t", "version"), "version"),
            ],
            from: TableRef::scan_final("gl_schema_versions", "t"),
            where_clause: Some(Expr::eq(Expr::col("t", "key"), Expr::string("graph"))),
            ..Default::default()
        };

        let (sql, params) = emit_simple_query(&Node::Query(Box::new(q))).unwrap();
        assert_eq!(
            sql,
            "SELECT t.key AS key, t.version AS version FROM gl_schema_versions FINAL AS t WHERE (t.key = {p0:String})"
        );
        assert_eq!(params.len(), 1);
        assert_eq!(
            params.get("p0").map(|p| &p.value),
            Some(&Value::String("graph".into()))
        );
    }

    #[test]
    fn render_replaces_scalar_params() {
        let mut params = HashMap::new();
Loading