Wednesday, April 26, 2023

Generating a Shakespeare corpus for full-text searching from JSON

 (this is a placeholder, I will format the following more beautifully later)

Files are here: https://github.com/marhar/duckdb_tools/tree/main/full-text-shakespeare

mh duckdb_tools/full-text-shakespeare% cat generate-shakespeare-csv.sql

.echo on

-- We want to create a simple "flat" table from the nested structure here:

--    'https://raw.githubusercontent.com/grokify/kibana-tutorial-go/master/shakespeare.json',

--

-- In addition, we need to generate a unique identifier for each line, since that is

-- required for full text search to work properly.


create table corpus as select 'x' as line_id, play_name, line_number,speaker, text_entry

  from read_json(

         'https://raw.githubusercontent.com/grokify/kibana-tutorial-go/master/shakespeare.json',

         auto_detect=true)

   where play_name is not null and line_number is not null and speaker is not null and text_entry is not null;


-- Get rid of stage directions, Act and Scene number headers, etc.


delete from corpus where line_number = '';


-- Chat GPT claims these abbreviations are commonly used by Shakespeare scholars,

-- and helpfully provides this information in the form of an update.

-- Hope Chat GPT isn't hallucinating about this!


UPDATE corpus

SET line_id = 

    CASE 

        WHEN play_name = 'Alls well that ends well' THEN 'AWW/' || line_number

        WHEN play_name = 'Antony and Cleopatra' THEN 'A&C/' || line_number

        WHEN play_name = 'As you like it' THEN 'AYLI/' || line_number

        WHEN play_name = 'A Comedy of Errors' THEN 'CE/' || line_number

        WHEN play_name = 'Coriolanus' THEN 'COR/' || line_number

        WHEN play_name = 'Cymbeline' THEN 'CYM/' || line_number

        WHEN play_name = 'Hamlet' THEN 'HAM/' || line_number

        WHEN play_name = 'Henry IV' THEN 'H4/' || line_number

        WHEN play_name = 'Henry VI Part 1' THEN '1H6/' || line_number

        WHEN play_name = 'Henry VI Part 2' THEN '2H6/' || line_number

        WHEN play_name = 'Henry VI Part 3' THEN '3H6/' || line_number

        WHEN play_name = 'Henry V' THEN 'H5/' || line_number

        WHEN play_name = 'Henry VIII' THEN 'H8/' || line_number

        WHEN play_name = 'Julius Caesar' THEN 'JC/' || line_number

        WHEN play_name = 'King John' THEN 'KJ/' || line_number

        WHEN play_name = 'King Lear' THEN 'KL/' || line_number

        WHEN play_name = 'Loves Labours Lost' THEN 'LLL/' || line_number

        WHEN play_name = 'Macbeth' THEN 'MAC/' || line_number

        WHEN play_name = 'Measure for measure' THEN 'MM/' || line_number

        WHEN play_name = 'Merchant of Venice' THEN 'MV/' || line_number

        WHEN play_name = 'Merry Wives of Windsor' THEN 'MWW/' || line_number

        WHEN play_name = 'A Midsummer nights dream' THEN 'MSND/' || line_number

        WHEN play_name = 'Much Ado about nothing' THEN 'MAAN/' || line_number

        WHEN play_name = 'Othello' THEN 'OTH/' || line_number

        WHEN play_name = 'Pericles' THEN 'PER/' || line_number

        WHEN play_name = 'Richard II' THEN 'R2/' || line_number

        WHEN play_name = 'Richard III' THEN 'R3/' || line_number

        WHEN play_name = 'Romeo and Juliet' THEN 'R&J/' || line_number

        WHEN play_name = 'Taming of the Shrew' THEN 'TOTS/' || line_number

        WHEN play_name = 'The Tempest' THEN 'TEMP/' || line_number

        ELSE 'unknown/' || line_number

    END;


-- Finally, save the data in whatever format you like.

-- Or just use it in-memory!


copy corpus TO 'shakespeare.csv' (header, delimiter '|');

copy corpus TO 'shakespeare.parquet' (format parquet, compression zstd);

mh duckdb_tools/full-text-shakespeare% 

No comments:

Post a Comment

Setting up DuckDB/Rust

Here's what I did to get DuckDB/rust going on my system. Prerequisites We need two things from DuckDB: header files (so we can build duc...