I learned a lot from Laurens Kuiper's blog post Testing out DuckDB's Full Text Search Extension, but unfortunately couldn't find the data set he was using for testing.
So, here's a simple "hello world" example using the easily accessible (and parsable!) kjv.txt. Note that everything past the table creation can be done in the duckdb cli.
% cat duckdb-full-text-search.py
import re
import duckdb
# -------- prepare the data -----------
fd = open('kjv.txt') # https://www.o-bible.com/download/kjv.txt
fd.readline() # skip first line
data = []
for line in fd.readlines():
line = line.rstrip()
# book, chap, verse, body = re.match(r'(\d?[A-Za-z]+)(\d+):(\d+)\s+(.*)', line).groups()
ref, body = re.match(r'(\d?[A-Za-z]+\d+:\d+)\s+(.*)', line).groups()
data.append((ref,body,))
# -------- create the table -----------
db = duckdb.connect()
db.cursor().execute("create table corpus(ref text, body text)")
db.cursor().executemany("insert into corpus(ref, body) values($1, $2)", data)
# -------- everything below could be run in the duckdb cli ---------
# -------- create the index -----------
db.cursor().execute(
"""
install 'fts';
load fts;
pragma create_fts_index('corpus', 'ref', 'ref', 'body');
""")
# -------- full text query -----------
print(db.sql("""
select fts_main_corpus.match_bm25(ref, 'whale') as score,
ref, body as "search for 'whale'"
from corpus
where score is not null
order by score;
"""))
% python3 ./duckdb-full-text-search.py
┌───────────────────┬──────────┬───────────────────────────────────────────────┐
│ score │ ref │ search for 'whale' │
│ double │ varchar │ varchar │
├───────────────────┼──────────┼───────────────────────────────────────────────┤
│ 2.7248255618541 │ Eze32:2 │ Son of man, take up a lamentation for Phara… │
│ 3.839526928067141 │ Ge1:21 │ And God created great whales, and every liv… │
│ 3.839526928067141 │ Mat12:40 │ For as Jonas was three days and three night… │
│ 6.497660955190547 │ Job7:12 │ Am I a sea, or a whale, that thou settest a… │
└───────────────────┴──────────┴───────────────────────────────────────────────┘
No comments:
Post a Comment