Verified Commit dd848f91 authored by Michael Usachenko, committed by GitLab

feat(ontology): generate local DuckDB DDL from ontology

parent 0f2adacd
+1 −1
@@ -54,7 +54,7 @@ CLI integration tests (concurrency, worktrees): `mise test:cli`.
| Query test fixtures | `fixtures/queries/` |
| Graph DDL (ClickHouse) | `config/graph.sql` |
| Schema version file | `config/SCHEMA_VERSION` (bump when `graph.sql` or `config/ontology/` changes) |
| Graph DDL (local DuckDB) | `config/graph_local.sql` |
| Graph DDL (local DuckDB) | Generated at runtime from ontology via `generate_local_tables()` + `duckdb_ddl` |
| Datalake DDL (ClickHouse) | `fixtures/siphon.sql` |
| gRPC service definition | `crates/gkg-server/proto/gkg.proto` |
| Server config structure | `crates/gkg-server/src/config.rs` |
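For orientation, the replacement flow consolidated from the diffs below looks roughly like this (a sketch only: `ontology` and `db_path` are assumed to already be in scope, and error handling is elided):

// Sketch: local-mode schema setup after this commit. The hand-written
// config/graph_local.sql is gone; the DDL is built from the ontology instead.
let ddl = generate_local_ddl(&ontology); // graph tables + duckdb_client::MANIFEST_DDL
let client = duckdb_client::DuckDbClient::open(&db_path)?;
client.initialize_schema(&ddl)?; // runs the generated CREATE TABLE statements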

config/graph_local.sql

deleted 100644 → 0
+0 −81
-- DuckDB schema for local code graph tables.
--
-- Mirrors the code-indexing subset of graph.sql (ClickHouse).
-- Differences:
--   - No ENGINE, CODEC, PROJECTION, INDEX, or SETTINGS clauses
--   - No traversal_path — local mode has no multi-tenant namespace scoping
--   - No _version or _deleted columns — local mode does full delete-and-reinsert

-- Manifest: tracks indexed repos and maps repo paths to project IDs.
-- Replaces the JSON manifest file and advisory lock.
CREATE TYPE IF NOT EXISTS repo_status AS ENUM ('pending', 'indexing', 'indexed', 'error');

CREATE TABLE IF NOT EXISTS _orbit_manifest (
    repo_path VARCHAR PRIMARY KEY,
    project_id BIGINT NOT NULL,
    parent_repo_path VARCHAR,
    branch VARCHAR,
    commit_sha VARCHAR,
    status repo_status NOT NULL DEFAULT 'pending',
    last_indexed_at TIMESTAMP,
    error_message VARCHAR
);

CREATE TABLE IF NOT EXISTS gl_directory (
    id BIGINT NOT NULL,
    project_id BIGINT NOT NULL,
    branch VARCHAR NOT NULL,
    commit_sha VARCHAR NOT NULL,
    path VARCHAR NOT NULL,
    name VARCHAR NOT NULL
);

CREATE TABLE IF NOT EXISTS gl_file (
    id BIGINT NOT NULL,
    project_id BIGINT NOT NULL,
    branch VARCHAR NOT NULL,
    commit_sha VARCHAR NOT NULL,
    path VARCHAR NOT NULL,
    name VARCHAR NOT NULL,
    extension VARCHAR NOT NULL DEFAULT '',
    language VARCHAR NOT NULL DEFAULT ''
);

CREATE TABLE IF NOT EXISTS gl_definition (
    id BIGINT NOT NULL,
    project_id BIGINT NOT NULL,
    branch VARCHAR NOT NULL,
    commit_sha VARCHAR NOT NULL,
    file_path VARCHAR NOT NULL,
    fqn VARCHAR NOT NULL,
    name VARCHAR NOT NULL,
    definition_type VARCHAR NOT NULL,
    start_line BIGINT NOT NULL,
    end_line BIGINT NOT NULL,
    start_byte BIGINT NOT NULL,
    end_byte BIGINT NOT NULL
);

CREATE TABLE IF NOT EXISTS gl_imported_symbol (
    id BIGINT NOT NULL,
    project_id BIGINT NOT NULL,
    branch VARCHAR NOT NULL,
    commit_sha VARCHAR NOT NULL,
    file_path VARCHAR NOT NULL,
    import_type VARCHAR NOT NULL,
    import_path VARCHAR NOT NULL,
    identifier_name VARCHAR,
    identifier_alias VARCHAR,
    start_line BIGINT NOT NULL,
    end_line BIGINT NOT NULL,
    start_byte BIGINT NOT NULL,
    end_byte BIGINT NOT NULL
);

CREATE TABLE IF NOT EXISTS gl_edge (
    source_id BIGINT NOT NULL,
    source_kind VARCHAR NOT NULL,
    relationship_kind VARCHAR NOT NULL,
    target_id BIGINT NOT NULL,
    target_kind VARCHAR NOT NULL
);
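
The manifest DDL at the top of this deleted file is the one part that is not generated from the ontology; per the diffs below it survives as `duckdb_client::MANIFEST_DDL`, which gets appended to the generated table DDL. A sketch of what that constant presumably holds (the name comes from this commit; the body is assumed to match the deleted lines above):

pub const MANIFEST_DDL: &str = r#"
CREATE TYPE IF NOT EXISTS repo_status AS ENUM ('pending', 'indexing', 'indexed', 'error');

CREATE TABLE IF NOT EXISTS _orbit_manifest (
    repo_path VARCHAR PRIMARY KEY,
    project_id BIGINT NOT NULL,
    parent_repo_path VARCHAR,
    branch VARCHAR,
    commit_sha VARCHAR,
    status repo_status NOT NULL DEFAULT 'pending',
    last_indexed_at TIMESTAMP,
    error_message VARCHAR
);
"#;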
+15 −1
@@ -17,6 +17,19 @@ use std::sync::Arc;
use tracing::{Level, info};
use tracing_subscriber::fmt::format::FmtSpan;

/// Generate the full DuckDB DDL (graph tables + manifest) from the ontology.
fn generate_local_ddl(ontology: &Ontology) -> String {
    let tables = query_engine::compiler::generate_local_tables(ontology);
    let mut ddl = tables
        .iter()
        .map(|t| format!("{};\n", query_engine::compiler::emit_duckdb_create_table(t)))
        .collect::<Vec<_>>()
        .join("\n");
    ddl.push('\n');
    ddl.push_str(duckdb_client::MANIFEST_DDL);
    ddl
}

#[derive(Debug, Clone, Copy, Default, clap::ValueEnum)]
enum OutputFormat {
    #[default]
@@ -393,8 +406,9 @@ async fn run_index(path: PathBuf, threads: usize, show_stats: bool) -> Result<()
        let db_path = store.db_path();
        let client =
            duckdb_client::DuckDbClient::open(&db_path).context("failed to open DuckDB")?;
        let ddl = generate_local_ddl(&ontology);
        client
            .initialize_schema()
            .initialize_schema(&ddl)
            .context("failed to create schema")?;
    }
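
Neither `generate_local_tables` nor `emit_duckdb_create_table` appears in this diff. As a rough mental model only, the emitter presumably walks a per-table column list; every type and field name below is invented for illustration (the real definitions live in `query_engine::compiler`):

// Hypothetical table model, for illustration only.
struct LocalColumn {
    name: String,
    sql_type: String,        // e.g. "BIGINT", "VARCHAR"
    nullable: bool,
    default: Option<String>, // e.g. Some("''".to_string())
}

struct LocalTable {
    name: String,
    columns: Vec<LocalColumn>,
}

// Sketch of the emitter: one CREATE TABLE per table, with no trailing
// semicolon, since generate_local_ddl appends ";\n" itself (see above).
fn emit_duckdb_create_table(t: &LocalTable) -> String {
    let cols: Vec<String> = t
        .columns
        .iter()
        .map(|c| {
            let mut col = format!("    {} {}", c.name, c.sql_type);
            if !c.nullable {
                col.push_str(" NOT NULL");
            }
            if let Some(d) = &c.default {
                col.push_str(" DEFAULT ");
                col.push_str(d);
            }
            col
        })
        .collect();
    format!(
        "CREATE TABLE IF NOT EXISTS {} (\n{}\n)",
        t.name,
        cols.join(",\n")
    )
}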

+68 −22
@@ -6,7 +6,6 @@ use duckdb::params;

use crate::converter::LocalGraphData;
use crate::error::{DuckDbError, Result};
use crate::schema::{CODE_GRAPH_TABLES, SCHEMA_DDL};

const MAX_OPEN_RETRIES: u32 = 10;
const INITIAL_BACKOFF: Duration = Duration::from_millis(100);
@@ -79,17 +78,30 @@ impl DuckDbClient {
        Ok(Self { conn })
    }

    pub fn initialize_schema(&self) -> Result<()> {
    /// Create all graph tables and the manifest table from the given DDL.
    ///
    /// The DDL is typically generated from the ontology via
    /// `generate_local_tables` + `emit_duckdb_create_table`, with the
    /// manifest DDL (`MANIFEST_DDL`) appended.
    pub fn initialize_schema(&self, ddl: &str) -> Result<()> {
        self.conn
            .execute_batch(SCHEMA_DDL)
            .execute_batch(ddl)
            .map_err(|e| DuckDbError::Schema(e.to_string()))?;
        Ok(())
    }

    /// Bulk insert via DuckDB's Appender, which converts Arrow RecordBatch
    /// directly to DuckDB DataChunks — no SQL parsing, no vtab overhead.
    pub fn insert_arrow(&self, table: &str, batch: RecordBatch) -> Result<()> {
        if !CODE_GRAPH_TABLES.contains(&table) {
    ///
    /// `allowed_tables` is an allowlist of valid table names. Pass the
    /// table names derived from the ontology's `local_db` config.
    pub fn insert_arrow(
        &self,
        table: &str,
        batch: RecordBatch,
        allowed_tables: &[&str],
    ) -> Result<()> {
        if !allowed_tables.contains(&table) {
            return Err(DuckDbError::Schema(format!("unknown table: {table}")));
        }
        if batch.num_rows() == 0 {
@@ -119,11 +131,11 @@ impl DuckDbClient {
        Ok(batches)
    }

    /// Deletes all data across all tables. In local mode each DB file is
    /// one project, so a full truncate is the correct reset before re-indexing.
    /// Delete all data from all code graph tables.
    pub fn delete_all_data(&self) -> Result<()> {
        for table in CODE_GRAPH_TABLES {
    /// Deletes all data across all graph tables. In local mode each DB file
    /// is one project, so a full truncate is the correct reset before
    /// re-indexing. The manifest table is preserved.
    pub fn delete_all_data(&self, tables: &[&str]) -> Result<()> {
        for table in tables {
            self.conn
                .execute(&format!("DELETE FROM {table}"), params![])?;
        }
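
Callers now have to supply the table list to both `insert_arrow` and `delete_all_data`. A sketch of deriving that allowlist from the ontology (the `.name` field access is an assumption, mirroring the hypothetical model above; error handling elided):

// Sketch: build the allowlist once from the ontology-derived tables.
fn graph_table_names(ontology: &Ontology) -> Vec<String> {
    query_engine::compiler::generate_local_tables(ontology)
        .iter()
        .map(|t| t.name.clone())
        .collect()
}

// Usage:
//     let names = graph_table_names(&ontology);
//     let refs: Vec<&str> = names.iter().map(String::as_str).collect();
//     client.insert_arrow("gl_file", batch, &refs)?;
//     client.delete_all_data(&refs)?; // manifest not listed, so it is preserved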
@@ -222,6 +234,38 @@ mod tests {
    use arrow::datatypes::{DataType, Field, Schema};
    use std::sync::Arc;

    /// Test DDL covering only the tables these tests exercise.
    const TEST_DDL: &str = "\
CREATE TABLE IF NOT EXISTS gl_directory (
    id BIGINT NOT NULL,
    project_id BIGINT NOT NULL,
    branch VARCHAR NOT NULL,
    commit_sha VARCHAR NOT NULL,
    path VARCHAR NOT NULL,
    name VARCHAR NOT NULL
);

CREATE TABLE IF NOT EXISTS gl_file (
    id BIGINT NOT NULL,
    project_id BIGINT NOT NULL,
    branch VARCHAR NOT NULL,
    commit_sha VARCHAR NOT NULL,
    path VARCHAR NOT NULL,
    name VARCHAR NOT NULL,
    extension VARCHAR,
    language VARCHAR
);

CREATE TABLE IF NOT EXISTS gl_edge (
    source_id BIGINT NOT NULL,
    source_kind VARCHAR NOT NULL,
    relationship_kind VARCHAR NOT NULL,
    target_id BIGINT NOT NULL,
    target_kind VARCHAR NOT NULL
);";

    const TEST_TABLES: &[&str] = &["gl_directory", "gl_file", "gl_edge"];

    fn file_schema() -> Arc<Schema> {
        Arc::new(Schema::new(vec![
            Field::new("id", DataType::Int64, false),
@@ -256,7 +300,7 @@ mod tests {
    #[test]
    fn schema_creation_and_sql_roundtrip() {
        let client = DuckDbClient::open_in_memory().unwrap();
        client.initialize_schema().unwrap();
        client.initialize_schema(TEST_DDL).unwrap();

        client
            .conn
@@ -291,10 +335,10 @@ mod tests {
    #[test]
    fn appender_insert_and_query() {
        let client = DuckDbClient::open_in_memory().unwrap();
        client.initialize_schema().unwrap();
        client.initialize_schema(TEST_DDL).unwrap();

        let batch = make_file_batch(&[10, 11], &["a.rs", "b.rs"]);
        client.insert_arrow("gl_file", batch).unwrap();
        client.insert_arrow("gl_file", batch, TEST_TABLES).unwrap();

        let result = client
            .query_arrow("SELECT id, name FROM gl_file ORDER BY id")
@@ -314,7 +358,7 @@ mod tests {
    #[test]
    fn large_batch_appender() {
        let client = DuckDbClient::open_in_memory().unwrap();
        client.initialize_schema().unwrap();
        client.initialize_schema(TEST_DDL).unwrap();

        let n = 5000;
        let ids: Vec<i64> = (0..n).collect();
@@ -322,7 +366,7 @@ mod tests {
        let name_refs: Vec<&str> = names.iter().map(|s| s.as_str()).collect();

        let batch = make_file_batch(&ids, &name_refs);
        client.insert_arrow("gl_file", batch).unwrap();
        client.insert_arrow("gl_file", batch, TEST_TABLES).unwrap();

        let result = client
            .query_arrow("SELECT count(*) as cnt FROM gl_file")
@@ -338,7 +382,7 @@ mod tests {
    #[test]
    fn delete_all_data_truncates() {
        let client = DuckDbClient::open_in_memory().unwrap();
        client.initialize_schema().unwrap();
        client.initialize_schema(TEST_DDL).unwrap();

        client
            .conn
@@ -355,7 +399,7 @@ mod tests {
            )
            .unwrap();

        client.delete_all_data().unwrap();
        client.delete_all_data(TEST_TABLES).unwrap();

        let batches = client
            .query_arrow("SELECT count(*) as cnt FROM gl_file")
@@ -374,7 +418,7 @@ mod tests {
        let db_path = dir.path().join("test.duckdb");

        let client = DuckDbClient::open(&db_path).unwrap();
        client.initialize_schema().unwrap();
        client.initialize_schema(TEST_DDL).unwrap();
        client
            .conn
            .execute(
@@ -401,20 +445,22 @@ mod tests {
    #[test]
    fn insert_arrow_rejects_unknown_table() {
        let client = DuckDbClient::open_in_memory().unwrap();
        client.initialize_schema().unwrap();
        client.initialize_schema(TEST_DDL).unwrap();

        let batch = make_file_batch(&[1], &["a.rs"]);
        let err = client.insert_arrow("evil_table", batch).unwrap_err();
        let err = client
            .insert_arrow("evil_table", batch, TEST_TABLES)
            .unwrap_err();
        assert!(err.to_string().contains("unknown table"));
    }

    #[test]
    fn insert_empty_batch_is_noop() {
        let client = DuckDbClient::open_in_memory().unwrap();
        client.initialize_schema().unwrap();
        client.initialize_schema(TEST_DDL).unwrap();

        let batch = make_file_batch(&[], &[]);
        client.insert_arrow("gl_file", batch).unwrap();
        client.insert_arrow("gl_file", batch, TEST_TABLES).unwrap();

        let result = client
            .query_arrow("SELECT count(*) as cnt FROM gl_file")