Friday, April 28, 2023

DuckDB RANGE command


-- RANGE returns a list of ints or timestamps.

--  select range(n);       -- list of ints from [0,n)

--  select range(n,m);     -- list of ints from [m,n)

--  select range(n,m,k);  -- list of ints from [m,n) with interval k

--  generate_series does the same, but include the stop value, e.g. [0,n].


select range(5)             union

select range(3,4)           union

select range(0,10,2)        union

select generate_series(5);

┌────────────────────┐

      range(5)     

      int64[]      

├────────────────────┤

│ [0, 1, 2, 3, 4, 5] │

│ [0, 2, 4, 6, 8]   

│ [3]               

│ [0, 1, 2, 3, 4]   

└────────────────────┘


-- timestamps require a start, stop, and interval.

select range(timestamp '2023-01-01', timestamp '2023-01-03', interval 1 day) as x

union

select generate_series(timestamp '2023-01-01', timestamp '2023-01-03', interval 1 day) as x;

┌─────────────────────────────────────────────────────────────────┐

                                x                               

                          timestamp[]                          

├─────────────────────────────────────────────────────────────────┤

│ [2023-01-01 00:00:00, 2023-01-02 00:00:00]                     

│ [2023-01-01 00:00:00, 2023-01-02 00:00:00, 2023-01-03 00:00:00] │

└─────────────────────────────────────────────────────────────────┘


-- unnest() can be used to convert the list into rows.

select unnest(range(3)) as using_unnest;

┌──────────────┐

│ using_unnest │

    int64    

├──────────────┤

            0 │

            1 │

            2 │

└──────────────┘


-- select can also be used to convert the list into rows.

-- to name the column from a select, use AS _(name).

--   select * from unnest(range(2)) as x;   -- doesn't work

select * from unnest(range(2)) as _(using_select);

┌──────────────┐

│ using_select │

    int64    

├──────────────┤

            0 │

            1 │

└──────────────┘


-- use with/select/unnest to generate columns of different types


with t as (select * from range(3) _(x))

  select x, 'foo'||cast(x as text) as x_as_text,

         x*.01 as x_as_decimal from t;

┌───────┬───────────┬───────────────┐

  x   │ x_as_text │ x_as_decimal 

│ int64 │  varchar  │ decimal(21,2) │

├───────┼───────────┼───────────────┤

    0 │ foo0                0.00 │

    1 │ foo1                0.01 │

    2 │ foo2                0.02 │

└───────┴───────────┴───────────────┘


-- Interval can be either hard coded or specified as a string

select range(timestamp '2023-01-01', timestamp '2023-01-03', interval 1 day) as x

union

select range(timestamp '2023-01-01', timestamp '2023-01-03', interval '1 day') as x;

┌────────────────────────────────────────────┐

                    x                     

                timestamp[]                

├────────────────────────────────────────────┤

│ [2023-01-01 00:00:00, 2023-01-02 00:00:00] │

└────────────────────────────────────────────┘


-- Use POSITIONAL JOIN to combine ranges.  A not null check can be useful.


select * from 

  range(timestamp '2023-01-01', timestamp '2023-01-04', interval 1 day) as d(ts)

    positional join

  range(11,44,11) as t(x);

┌─────────────────────┬───────┐

        ts            x  

      timestamp      │ int64 │

├─────────────────────┼───────┤

│ 2023-01-01 00:00:00 │    11 │

│ 2023-01-02 00:00:00 │    22 │

│ 2023-01-03 00:00:00 │    33 │

└─────────────────────┴───────┘


select * from 

  range(timestamp '2023-01-01', timestamp '2023-01-04', interval 1 day) as d(ts)

  positional join range(9999) as t(x)

where ts is not null;

┌─────────────────────┬───────┐

        ts            x  

      timestamp      │ int64 │

├─────────────────────┼───────┤

│ 2023-01-01 00:00:00 │     0 │

│ 2023-01-02 00:00:00 │     1 │

│ 2023-01-03 00:00:00 │     2 │

└─────────────────────┴───────┘


-- Combine all of the above to generate data sets.


with t as (

  select * from 

    range(timestamp '2023-01-01', timestamp '2023-01-04', interval 1 day) as foo1(ts)

    positional join range(999) as foo2(x)

  where ts is not null

)

  select ts, x, 100-x as y, x*2.2 as z, 'foo'||x::text as text_x

  from t;

┌─────────────────────┬───────┬───────┬───────────────┬─────────┐

        ts            x     y         z       │ text_x 

      timestamp      │ int64 │ int64 │ decimal(21,1) │ varchar │

├─────────────────────┼───────┼───────┼───────────────┼─────────┤

│ 2023-01-01 00:00:00 │     0 │   100 │           0.0 │ foo0   

│ 2023-01-02 00:00:00 │     1 │    99 │           2.2 │ foo1   

│ 2023-01-03 00:00:00 │     2 │    98 │           4.4 │ foo2   

└─────────────────────┴───────┴───────┴───────────────┴─────────┘

Wednesday, April 26, 2023

Generating a Shakespeare corpus for full-text searching from JSON

 (this is a placeholder, I will format the following more beautifully later)

Files are here: https://github.com/marhar/duckdb_tools/tree/main/full-text-shakespeare

mh duckdb_tools/full-text-shakespeare% cat generate-shakespeare-csv.sql

.echo on

-- We want to create a simple "flat" table from the nested structure here:

--    'https://raw.githubusercontent.com/grokify/kibana-tutorial-go/master/shakespeare.json',

--

-- In addition, we need to generate a unique identifier for each line, since that is

-- required for full text search to work properly.


create table corpus as select 'x' as line_id, play_name, line_number,speaker, text_entry

  from read_json(

         'https://raw.githubusercontent.com/grokify/kibana-tutorial-go/master/shakespeare.json',

         auto_detect=true)

   where play_name is not null and line_number is not null and speaker is not null and text_entry is not null;


-- Get rid of stage directions, Act and Scene number headers, etc.


delete from corpus where line_number = '';


-- Chat GPT claims these abbreviations are commonly used by Shakespeare scholars,

-- and helpfully provides this information in the form of an update.

-- Hope Chat GPT isn't hallucinating about this!


UPDATE corpus

SET line_id = 

    CASE 

        WHEN play_name = 'Alls well that ends well' THEN 'AWW/' || line_number

        WHEN play_name = 'Antony and Cleopatra' THEN 'A&C/' || line_number

        WHEN play_name = 'As you like it' THEN 'AYLI/' || line_number

        WHEN play_name = 'A Comedy of Errors' THEN 'CE/' || line_number

        WHEN play_name = 'Coriolanus' THEN 'COR/' || line_number

        WHEN play_name = 'Cymbeline' THEN 'CYM/' || line_number

        WHEN play_name = 'Hamlet' THEN 'HAM/' || line_number

        WHEN play_name = 'Henry IV' THEN 'H4/' || line_number

        WHEN play_name = 'Henry VI Part 1' THEN '1H6/' || line_number

        WHEN play_name = 'Henry VI Part 2' THEN '2H6/' || line_number

        WHEN play_name = 'Henry VI Part 3' THEN '3H6/' || line_number

        WHEN play_name = 'Henry V' THEN 'H5/' || line_number

        WHEN play_name = 'Henry VIII' THEN 'H8/' || line_number

        WHEN play_name = 'Julius Caesar' THEN 'JC/' || line_number

        WHEN play_name = 'King John' THEN 'KJ/' || line_number

        WHEN play_name = 'King Lear' THEN 'KL/' || line_number

        WHEN play_name = 'Loves Labours Lost' THEN 'LLL/' || line_number

        WHEN play_name = 'Macbeth' THEN 'MAC/' || line_number

        WHEN play_name = 'Measure for measure' THEN 'MM/' || line_number

        WHEN play_name = 'Merchant of Venice' THEN 'MV/' || line_number

        WHEN play_name = 'Merry Wives of Windsor' THEN 'MWW/' || line_number

        WHEN play_name = 'A Midsummer nights dream' THEN 'MSND/' || line_number

        WHEN play_name = 'Much Ado about nothing' THEN 'MAAN/' || line_number

        WHEN play_name = 'Othello' THEN 'OTH/' || line_number

        WHEN play_name = 'Pericles' THEN 'PER/' || line_number

        WHEN play_name = 'Richard II' THEN 'R2/' || line_number

        WHEN play_name = 'Richard III' THEN 'R3/' || line_number

        WHEN play_name = 'Romeo and Juliet' THEN 'R&J/' || line_number

        WHEN play_name = 'Taming of the Shrew' THEN 'TOTS/' || line_number

        WHEN play_name = 'The Tempest' THEN 'TEMP/' || line_number

        ELSE 'unknown/' || line_number

    END;


-- Finally, save the data in whatever format you like.

-- Or just use it in-memory!


copy corpus TO 'shakespeare.csv' (header, delimiter '|');

copy corpus TO 'shakespeare.parquet' (format parquet, compression zstd);

mh duckdb_tools/full-text-shakespeare% 

Setting up DuckDB/Rust

Here's what I did to get DuckDB/rust going on my system. Prerequisites We need two things from DuckDB: header files (so we can build duc...