Sunday, August 20, 2023

Setting up DuckDB/Rust

Here's what I did to get DuckDB/rust going on my system.

Prerequisites

We need two things from DuckDB: header files (so we can build duckdb-rs) and the library (so duckdb-rs can call the DuckDB functions).  We will get these by building DuckDB.  At the time of writing, version 0.8.1 is the DuckDB version supported by duckdb-rs.

We will clone the DuckDB repo, check out version 0.8.1, and build it.

mkdir $HOME/duck81

cd $HOME/duck81

git clone https://github.com/duckdb/duckdb.git

cd duckdb

git checkout tags/v0.8.1

make

Next, we need a couple of environment variables.  These are used when cargo builds the duckdb-rs crate.

export DUCKDB_LIB_DIR=$HOME/duck81/duckdb/build/release/src

export DUCKDB_INCLUDE_DIR=$HOME/duck81/duckdb/src/include


Finally, we need to tell the OS where the dynamic libraries are located.

# macos

export DYLD_LIBRARY_PATH=$DYLD_LIBRARY_PATH:$DUCKDB_LIB_DIR

# linux

export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$DUCKDB_LIB_DIR

This setup is fine for development but not for production use.  If you are shipping Rust in production, you know how to package shared libraries, etc.  There are some packaging notes on the duckdb-rs github page.

Cargo.toml

Add the following to the [dependencies] section of your Cargo.toml.

[dependencies]


duckdb = "0.8.1"

Sample Code

Here's a lightly modified version of the example code on the duckdb-rs github page.

use duckdb::{params, Connection, Result};


use duckdb::arrow::record_batch::RecordBatch;

use duckdb::arrow::util::pretty::print_batches;


/// Mirrors one row of the `person` table created in main():
/// a sequence-assigned id, a required name, and an optional BLOB payload.
#[derive(Debug)]

struct Person {

    id: i32,

    name: String,

    // BLOB column; NULL maps to None, bytes map to Some(Vec<u8>).
    data: Option<Vec<u8>>,

}


/// Opens an in-memory DuckDB database, creates a `person` table,
/// inserts four rows, then reads the data back two ways:
/// row-by-row via `query_map`, and as Arrow record batches via `query_arrow`.
fn main() -> Result<()> {

    // Nothing is persisted to disk; the database lives in memory.
    let conn = Connection::open_in_memory()?;


    // `id` defaults to the next value of the sequence, so the INSERTs
    // below only need to supply `name` and `data`.
    conn.execute_batch(r"

        CREATE SEQUENCE seq;

        CREATE TABLE person (

          id    INTEGER PRIMARY KEY DEFAULT NEXTVAL('seq'),

          name  TEXT NOT NULL,

          data  BLOB

                );

        ")?;


    // The second parameter binds to the BLOB column, so e.g. "aaa"
    // comes back as the bytes [97, 97, 97].
    conn.execute("INSERT INTO person (name, data) VALUES (?, ?)",

                 params!["able", "aaa"],)?;

    conn.execute("INSERT INTO person (name, data) VALUES (?, ?)",

                 params!["baker", "bbb"],)?;

    conn.execute("INSERT INTO person (name, data) VALUES (?, ?)",

                 params!["charlie", "ccc"],)?;

    conn.execute("INSERT INTO person (name, data) VALUES (?, ?)",

                 params!["dog", "ddd"],)?;


    // Read back row-by-row, mapping each row into a Person value.
    println!("query table by rows");

    let mut stmt = conn.prepare("SELECT id, name, data FROM person")?;

    let person_iter = stmt.query_map([], |row| {

        Ok(Person {

            id: row.get(0)?,

            name: row.get(1)?,

            data: row.get(2)?,

        })

    })?;


    for person in person_iter {

        println!("Found person {:?}", person.unwrap());

    }


    // Re-run the same prepared statement, this time collecting the
    // result as Arrow RecordBatches and pretty-printing them.
    println!("query table by arrow");

    let rbs: Vec<RecordBatch> = stmt.query_arrow([])?.collect();

    print_batches(&rbs).unwrap();

    Ok(())

}

Execution

We can ignore the (incorrect) warning about the Person fields never being read.

% cargo run


   Compiling duck1 v0.1.0 (/Users/neologic/biblio/anachron/duck1)

warning: fields `id`, `name`, and `data` are never read

 --> src/main.rs:7:5

  |

6 | struct Person {

  |        ------ fields in this struct

7 |     id: i32,

  |     ^^

8 |     name: String,

  |     ^^^^

9 |     data: Option<Vec<u8>>,

  |     ^^^^

  |

  = note: `Person` has a derived impl for the trait `Debug`, but this is intentionally ignored during dead code analysis

  = note: `#[warn(dead_code)]` on by default


warning: `duck1` (bin "duck1") generated 1 warning

    Finished dev [unoptimized + debuginfo] target(s) in 1.24s

     Running `target/debug/duck1`


query table by rows

Found person Person { id: 1, name: "able", data: Some([97, 97, 97]) }

Found person Person { id: 2, name: "baker", data: Some([98, 98, 98]) }

Found person Person { id: 3, name: "charlie", data: Some([99, 99, 99]) }

Found person Person { id: 4, name: "dog", data: Some([100, 100, 100]) }


query table by arrow

+----+---------+--------+

| id | name    | data   |

+----+---------+--------+

| 1  | able    | 616161 |

| 2  | baker   | 626262 |

| 3  | charlie | 636363 |

| 4  | dog     | 646464 |

+----+---------+--------+


Wednesday, May 24, 2023

Query Diffing

Given tables or query results A and B, we want to identify:

  • inserted rows:  The key appears in B, but not in A.
  • deleted rows:  The key appears in A, but not in B.
  • modified: The key appears in both tables, but with different values.
  • unmodified:  the key appears in both tables, and the values are the same.

For example, given the data below, identify:

  • a is modified, 1 --> 9
  • b is deleted
  • c is unmodified
  • d is inserted

CREATE TABLE A AS

          SELECT 'a' AS key, 1 AS val

    UNION SELECT 'b', 2

    UNION SELECT 'c', 3;


CREATE TABLE b AS 

          SELECT 'a' AS key, 9 AS val

    UNION SELECT 'c', 3

    UNION SELECT 'd', 4;


SELECT * FROM a ORDER BY KEY;

a,1

b,2

c,3


SELECT * FROM b ORDER BY KEY;

a,9

c,3

d,4

These queries will perform the specified operations.

Modified Rows

select b.key, b.val from a left join b

    on a.key = b.key where a.val <> b.val;

a,9

Deleted Rows

select key, val from a where key not in (select key from b);

b,2

Unmodified Rows

select key, val from a intersect select key, val from b;

c,3

Inserted Rows

select key, val from b where key not in (select key from a);

d,4

Note that if you have multiple columns in your key, you can specify them as (key1, key2, ...), etc.

For example, if the tables had two key columns the Inserted query would be

select key1, key2, val from b

    where (key1, key2) not in (select key1, key2 from a);

Friday, May 19, 2023

DuckDB Date Sampler



-- Builds a date-dimension result set: one row per day from 2009-01-01
-- up to (but not including) 2013-12-31, with day/week/month/quarter/year
-- attributes derived from each date.
WITH generate_date AS (

  -- RANGE over timestamps yields one value per day; cast each to DATE.
  SELECT CAST(RANGE AS DATE) AS date_key  

    FROM RANGE(DATE '2009-01-01', DATE '2013-12-31', INTERVAL 1 DAY)

)

SELECT date_key AS date_key,

  DAYOFYEAR(date_key) AS day_of_year, 

  YEARWEEK(date_key) AS week_key,

  WEEKOFYEAR(date_key) AS week_of_year,

  DAYOFWEEK(date_key) AS day_of_week,

  ISODOW(date_key) AS iso_day_of_week,

  DAYNAME(date_key) AS day_name,

  DATE_TRUNC('week', date_key) AS first_day_of_week,

  DATE_TRUNC('week', date_key) + 6 AS last_day_of_week,

  -- YYYYMM key; MONTH is zero-padded to two digits via RIGHT('0' || m, 2).
  YEAR(date_key) || RIGHT('0' || MONTH(date_key), 2) AS month_key,

  MONTH(date_key) AS month_of_year,

  DAYOFMONTH(date_key) AS day_of_month,

  LEFT(MONTHNAME(date_key), 3) AS month_name_short,

  MONTHNAME(date_key) AS month_name,

  DATE_TRUNC('month', date_key) AS first_day_of_month,

  LAST_DAY(date_key) AS last_day_of_month,

  CAST(YEAR(date_key) || QUARTER(date_key) AS INT) AS quarter_key,

  QUARTER(date_key) AS quarter_of_year,

  CAST(date_key - DATE_TRUNC('Quarter', date_key) + 1 AS INT) AS day_of_quarter,

  ('Q' || QUARTER(date_key)) AS quarter_desc_short,

  ('Quarter ' || QUARTER(date_key)) AS quarter_desc,

  DATE_TRUNC('quarter', date_key) AS first_day_of_quarter,

  -- Last day of the quarter = last day of the quarter's third month.
  LAST_DAY(DATE_TRUNC('quarter', date_key) + INTERVAL 2 MONTH) as last_day_of_quarter,

  CAST(YEAR(date_key) AS INT) AS year_key,

  DATE_TRUNC('Year', date_key) AS first_day_of_year,

  DATE_TRUNC('Year', date_key) - 1 + INTERVAL 1 YEAR AS last_day_of_year,

  -- e.g. "the 2nd Tuesday of this month": numbers each weekday within
  -- its year/month partition, ordered by date.
  ROW_NUMBER() OVER 

   (PARTITION BY YEAR(date_key), MONTH(date_key), DAYOFWEEK(date_key) ORDER BY date_key) 

   AS ordinal_weekday_of_month

FROM generate_date;


(credit)

Friday, April 28, 2023

DuckDB RANGE command


-- RANGE returns a list of ints or timestamps.

--  select range(n);       -- list of ints from [0,n)

--  select range(n,m);     -- list of ints from [n,m)

--  select range(n,m,k);   -- list of ints from [n,m) with interval k

--  generate_series does the same, but includes the stop value, e.g. [0,n].


select range(5)             union

select range(3,4)           union

select range(0,10,2)        union

select generate_series(5);

┌────────────────────┐

      range(5)     

      int64[]      

├────────────────────┤

│ [0, 1, 2, 3, 4, 5] │

│ [0, 2, 4, 6, 8]   

│ [3]               

│ [0, 1, 2, 3, 4]   

└────────────────────┘


-- timestamps require a start, stop, and interval.

select range(timestamp '2023-01-01', timestamp '2023-01-03', interval 1 day) as x

union

select generate_series(timestamp '2023-01-01', timestamp '2023-01-03', interval 1 day) as x;

┌─────────────────────────────────────────────────────────────────┐

                                x                               

                          timestamp[]                          

├─────────────────────────────────────────────────────────────────┤

│ [2023-01-01 00:00:00, 2023-01-02 00:00:00]                     

│ [2023-01-01 00:00:00, 2023-01-02 00:00:00, 2023-01-03 00:00:00] │

└─────────────────────────────────────────────────────────────────┘


-- unnest() can be used to convert the list into rows.

select unnest(range(3)) as using_unnest;

┌──────────────┐

│ using_unnest │

    int64    

├──────────────┤

            0 │

            1 │

            2 │

└──────────────┘


-- select can also be used to convert the list into rows.

-- to name the column from a select, use AS _(name).

--   select * from unnest(range(2)) as x;   -- doesn't work

select * from unnest(range(2)) as _(using_select);

┌──────────────┐

│ using_select │

    int64    

├──────────────┤

            0 │

            1 │

└──────────────┘


-- use with/select/unnest to generate columns of different types


with t as (select * from range(3) _(x))

  select x, 'foo'||cast(x as text) as x_as_text,

         x*.01 as x_as_decimal from t;

┌───────┬───────────┬───────────────┐

  x   │ x_as_text │ x_as_decimal 

│ int64 │  varchar  │ decimal(21,2) │

├───────┼───────────┼───────────────┤

    0 │ foo0                0.00 │

    1 │ foo1                0.01 │

    2 │ foo2                0.02 │

└───────┴───────────┴───────────────┘


-- Interval can be either hard coded or specified as a string

select range(timestamp '2023-01-01', timestamp '2023-01-03', interval 1 day) as x

union

select range(timestamp '2023-01-01', timestamp '2023-01-03', interval '1 day') as x;

┌────────────────────────────────────────────┐

                    x                     

                timestamp[]                

├────────────────────────────────────────────┤

│ [2023-01-01 00:00:00, 2023-01-02 00:00:00] │

└────────────────────────────────────────────┘


-- Use POSITIONAL JOIN to combine ranges.  A not null check can be useful.


select * from 

  range(timestamp '2023-01-01', timestamp '2023-01-04', interval 1 day) as d(ts)

    positional join

  range(11,44,11) as t(x);

┌─────────────────────┬───────┐

        ts            x  

      timestamp      │ int64 │

├─────────────────────┼───────┤

│ 2023-01-01 00:00:00 │    11 │

│ 2023-01-02 00:00:00 │    22 │

│ 2023-01-03 00:00:00 │    33 │

└─────────────────────┴───────┘


select * from 

  range(timestamp '2023-01-01', timestamp '2023-01-04', interval 1 day) as d(ts)

  positional join range(9999) as t(x)

where ts is not null;

┌─────────────────────┬───────┐

        ts            x  

      timestamp      │ int64 │

├─────────────────────┼───────┤

│ 2023-01-01 00:00:00 │     0 │

│ 2023-01-02 00:00:00 │     1 │

│ 2023-01-03 00:00:00 │     2 │

└─────────────────────┴───────┘


-- Combine all of the above to generate data sets.


with t as (

  select * from 

    range(timestamp '2023-01-01', timestamp '2023-01-04', interval 1 day) as foo1(ts)

    positional join range(999) as foo2(x)

  where ts is not null

)

  select ts, x, 100-x as y, x*2.2 as z, 'foo'||x::text as text_x

  from t;

┌─────────────────────┬───────┬───────┬───────────────┬─────────┐

        ts            x     y         z       │ text_x 

      timestamp      │ int64 │ int64 │ decimal(21,1) │ varchar │

├─────────────────────┼───────┼───────┼───────────────┼─────────┤

│ 2023-01-01 00:00:00 │     0 │   100 │           0.0 │ foo0   

│ 2023-01-02 00:00:00 │     1 │    99 │           2.2 │ foo1   

│ 2023-01-03 00:00:00 │     2 │    98 │           4.4 │ foo2   

└─────────────────────┴───────┴───────┴───────────────┴─────────┘

Setting up DuckDB/Rust

Here's what I did to get DuckDB/rust going on my system. Prerequisites We need two things from DuckDB: header files (so we can build duc...