EastBay DuckDB: April 2023

-- RANGE returns a list of ints or timestamps.

-- select range(n); -- list of ints from [0,n)

-- select range(n,m); -- list of ints from [m,n)

-- select range(n,m,k); -- list of ints from [m,n) with interval k

-- generate_series does the same, but include the stop value, e.g. [0,n].

select range(5) union

select range(3,4) union

select range(0,10,2) union

select generate_series(5);

┌────────────────────┐

│ range(5) │

│ int64[] │

├────────────────────┤

│ [0, 1, 2, 3, 4, 5] │

│ [0, 2, 4, 6, 8] │

│ [3] │

│ [0, 1, 2, 3, 4] │

└────────────────────┘

-- timestamps require a start, stop, and interval.

select range(timestamp '2023-01-01', timestamp '2023-01-03', interval 1 day) as x

union

select generate_series(timestamp '2023-01-01', timestamp '2023-01-03', interval 1 day) as x;

┌─────────────────────────────────────────────────────────────────┐

│ x │

│ timestamp[] │

├─────────────────────────────────────────────────────────────────┤

│ [2023-01-01 00:00:00, 2023-01-02 00:00:00] │

│ [2023-01-01 00:00:00, 2023-01-02 00:00:00, 2023-01-03 00:00:00] │

└─────────────────────────────────────────────────────────────────┘

-- unnest() can be used to convert the list into rows.

select unnest(range(3)) as using_unnest;

┌──────────────┐

│ using_unnest │

│ int64 │

├──────────────┤

│ 0 │

│ 1 │

│ 2 │

└──────────────┘

-- select can also be used to convert the list into rows.

-- to name the column from a select, use AS _(name).

-- select * from unnest(range(2)) as x; -- doesn't work

select * from unnest(range(2)) as _(using_select);

┌──────────────┐

│ using_select │

│ int64 │

├──────────────┤

│ 0 │

│ 1 │

└──────────────┘

-- use with/select/unnest to generate columns of different types

with t as (select * from range(3) _(x))

select x, 'foo'||cast(x as text) as x_as_text,

x*.01 as x_as_decimal from t;

┌───────┬───────────┬───────────────┐

│ x │ x_as_text │ x_as_decimal │

│ int64 │ varchar │ decimal(21,2) │

├───────┼───────────┼───────────────┤

│ 0 │ foo0 │ 0.00 │

│ 1 │ foo1 │ 0.01 │

│ 2 │ foo2 │ 0.02 │

└───────┴───────────┴───────────────┘

-- Interval can be either hard coded or specified as a string

select range(timestamp '2023-01-01', timestamp '2023-01-03', interval 1 day) as x

union

select range(timestamp '2023-01-01', timestamp '2023-01-03', interval '1 day') as x;

┌────────────────────────────────────────────┐

│ x │

│ timestamp[] │

├────────────────────────────────────────────┤

│ [2023-01-01 00:00:00, 2023-01-02 00:00:00] │

└────────────────────────────────────────────┘

-- Use POSITIONAL JOIN to combine ranges. A not null check can be useful.

select * from

range(timestamp '2023-01-01', timestamp '2023-01-04', interval 1 day) as d(ts)

positional join

range(11,44,11) as t(x);

┌─────────────────────┬───────┐

│ ts │ x │

│ timestamp │ int64 │

├─────────────────────┼───────┤

│ 2023-01-01 00:00:00 │ 11 │

│ 2023-01-02 00:00:00 │ 22 │

│ 2023-01-03 00:00:00 │ 33 │

└─────────────────────┴───────┘

select * from

range(timestamp '2023-01-01', timestamp '2023-01-04', interval 1 day) as d(ts)

positional join range(9999) as t(x)

where ts is not null;

┌─────────────────────┬───────┐

│ ts │ x │

│ timestamp │ int64 │

├─────────────────────┼───────┤

│ 2023-01-01 00:00:00 │ 0 │

│ 2023-01-02 00:00:00 │ 1 │

│ 2023-01-03 00:00:00 │ 2 │

└─────────────────────┴───────┘

-- Combine all of the above to generate data sets.

with t as (

select * from

range(timestamp '2023-01-01', timestamp '2023-01-04', interval 1 day) as foo1(ts)

positional join range(999) as foo2(x)

where ts is not null

)

select ts, x, 100-x as y, x*2.2 as z, 'foo'||x::text as text_x

from t;

┌─────────────────────┬───────┬───────┬───────────────┬─────────┐

│ ts │ x │ y │ z │ text_x │

│ timestamp │ int64 │ int64 │ decimal(21,1) │ varchar │

├─────────────────────┼───────┼───────┼───────────────┼─────────┤

│ 2023-01-01 00:00:00 │ 0 │ 100 │ 0.0 │ foo0 │

│ 2023-01-02 00:00:00 │ 1 │ 99 │ 2.2 │ foo1 │

│ 2023-01-03 00:00:00 │ 2 │ 98 │ 4.4 │ foo2 │

└─────────────────────┴───────┴───────┴───────────────┴─────────┘

(this is a placeholder, I will format the following more beautifully later)

Files are here: https://github.com/marhar/duckdb_tools/tree/main/full-text-shakespeare

mh duckdb_tools/full-text-shakespeare% cat generate-shakespeare-csv.sql

.echo on

-- We want to create a simple "flat" table from the nested structure here:

-- 'https://raw.githubusercontent.com/grokify/kibana-tutorial-go/master/shakespeare.json',

-- In addition, we need to generate a unique identifier for each line, since that is

-- required for full text search to work properly.

create table corpus as select 'x' as line_id, play_name, line_number,speaker, text_entry

from read_json(

'https://raw.githubusercontent.com/grokify/kibana-tutorial-go/master/shakespeare.json',

auto_detect=true)

where play_name is not null and line_number is not null and speaker is not null and text_entry is not null;

-- Get rid of stage directions, Act and Scene number headers, etc.

delete from corpus where line_number = '';

-- Chat GPT claims these abbreviations are commonly used by Shakespeare scholars,

-- and helpfully provides this information in the form of an update.

-- Hope Chat GPT isn't hallucinating about this!

UPDATE corpus

SET line_id =

CASE

WHEN play_name = 'Alls well that ends well' THEN 'AWW/' || line_number

WHEN play_name = 'Antony and Cleopatra' THEN 'A&C/' || line_number

WHEN play_name = 'As you like it' THEN 'AYLI/' || line_number

WHEN play_name = 'A Comedy of Errors' THEN 'CE/' || line_number

WHEN play_name = 'Coriolanus' THEN 'COR/' || line_number

WHEN play_name = 'Cymbeline' THEN 'CYM/' || line_number

WHEN play_name = 'Hamlet' THEN 'HAM/' || line_number

WHEN play_name = 'Henry IV' THEN 'H4/' || line_number

WHEN play_name = 'Henry VI Part 1' THEN '1H6/' || line_number

WHEN play_name = 'Henry VI Part 2' THEN '2H6/' || line_number

WHEN play_name = 'Henry VI Part 3' THEN '3H6/' || line_number

WHEN play_name = 'Henry V' THEN 'H5/' || line_number

WHEN play_name = 'Henry VIII' THEN 'H8/' || line_number

WHEN play_name = 'Julius Caesar' THEN 'JC/' || line_number

WHEN play_name = 'King John' THEN 'KJ/' || line_number

WHEN play_name = 'King Lear' THEN 'KL/' || line_number

WHEN play_name = 'Loves Labours Lost' THEN 'LLL/' || line_number

WHEN play_name = 'Macbeth' THEN 'MAC/' || line_number

WHEN play_name = 'Measure for measure' THEN 'MM/' || line_number

WHEN play_name = 'Merchant of Venice' THEN 'MV/' || line_number

WHEN play_name = 'Merry Wives of Windsor' THEN 'MWW/' || line_number

WHEN play_name = 'A Midsummer nights dream' THEN 'MSND/' || line_number

WHEN play_name = 'Much Ado about nothing' THEN 'MAAN/' || line_number

WHEN play_name = 'Othello' THEN 'OTH/' || line_number

WHEN play_name = 'Pericles' THEN 'PER/' || line_number

WHEN play_name = 'Richard II' THEN 'R2/' || line_number

WHEN play_name = 'Richard III' THEN 'R3/' || line_number

WHEN play_name = 'Romeo and Juliet' THEN 'R&J/' || line_number

WHEN play_name = 'Taming of the Shrew' THEN 'TOTS/' || line_number

WHEN play_name = 'The Tempest' THEN 'TEMP/' || line_number

ELSE 'unknown/' || line_number

END;

-- Finally, save the data in whatever format you like.

-- Or just use it in-memory!

copy corpus TO 'shakespeare.csv' (header, delimiter '|');

copy corpus TO 'shakespeare.parquet' (format parquet, compression zstd);

mh duckdb_tools/full-text-shakespeare%

EastBay DuckDB

Friday, April 28, 2023

DuckDB RANGE command

Wednesday, April 26, 2023

Generating a Shakespeare corpus for full-text searching from JSON

DuckLake ATTACH quick reference.

Report Abuse

Labels