db = database()

core
Introduction
We often have to go through a whole bunch of hoops to get documents processed and ready for searching through them. litesearch plans to make this as easy as possible by providing simple building blocks to set up a database with FTS5 and vector search capabilities.
/opt/hostedtoolcache/Python/3.12.13/x64/lib/python3.12/site-packages/usearch/__init__.py:131: UserWarning: Will download `usearch_sqlite` binary from GitHub.
warnings.warn("Will download `usearch_sqlite` binary from GitHub.", UserWarning)
Database.query
def query(
sql:str, params:Union=None
)->Generator:
Execute a query and return results as a list of AttrDict
Simple Docs table setup
Database.get_store
def get_store(
name:str='store', # table name
hash:bool=False, # whether to create hash index on content
kw:VAR_KEYWORD
):
Convenience for simple RAG: creates a content/embedding/metadata table with FTS5. For custom schemas, use vec_search and rrf_merge directly.
Table.vec_search
def vec_search(
emb:bytes, # query embedding vector (as bytes)
columns:list=None, # columns to return (None = all)
where:str=None, # additional WHERE clause
where_args:dict=None, # args for WHERE clause
emb_col:str='embedding', # name of the embedding column
emb_metric:str='cosine', # distance metric
dtype:type=float16, # embedding dtype
limit:int | None=50, # max results
offset:int | None=None, # offset for pagination
)->list:
Vector similarity search on any table with an embedding column.
rrf_merge
def rrf_merge(
fts_results, vec_results, k:int=60, limit:int=50, id_key:str='rowid'
)->list:
Reciprocal Rank Fusion: merges FTS and vector results. Items in both lists score highest.
database
def database(
pth_or_uri:str=':memory:', # the database name or URL
wal:bool=True, # use WAL mode
sem_search:bool=True, # enable usearch extensions
kw:VAR_KEYWORD
)->Database: # additional args to pass to apswutils database
Set up a database connection and load usearch extensions.
Database.search
def search(
q:str, # query string
emb:bytes, # embedding vector
columns:list=None, # columns to return
where:str=None, # additional where clause
where_args:dict=None, # args for where clause
limit:int | None=50, # limit on number of results
offset:int | None=None, # offset for results (ignored when rrf=True)
table_name:str='store', # table name
emb_col:str='embedding', # embedding column name
emb_metric:str='cosine', # embedding distance metric (cosine,sqeuclidean,inner,divergence)
rrf:bool=True, # rerank results with reciprocal rank fusion
rrf_k:int=60, # RRF k parameter
dtype:type=float16, # embedding dtype
id_key:str='rowid', # key to join RRF results on
quote:bool=False, # quote FTS query to disable special chars
):
Search the litesearch store with fts and vector search combined.
The fastlite database is set up with usearch extensions. Let’s run some distance calculations.
# Three example 100-dim float32 embeddings, serialized to bytes as the
# usearch distance functions expect.
embs = {
    'v1': np.ones(100, dtype=np.float32).tobytes(),        # vector of ones
    'v2': np.zeros(100, dtype=np.float32).tobytes(),       # vector of zeros
    'v3': np.full(100, 0.25, dtype=np.float32).tobytes(),  # vector of 0.25s
}
def dist_q(metric):
    "Compute the usearch f32 `metric` distance between every pair of example embeddings via SQL."
    pairs = [('v1', 'v2'), ('v1', 'v3'), ('v2', 'v3')]
    cols = ',\n'.join(
        f'distance_{metric}_f32(:{a},:{b}) as {metric}_{a}_{b}' for a, b in pairs)
    return db.q(f'''
select
{cols}
''', embs)
for fn in ['sqeuclidean', 'divergence', 'inner', 'cosine']: print(dist_q(fn))

[{'sqeuclidean_v1_v2': 100.0, 'sqeuclidean_v1_v3': 56.25, 'sqeuclidean_v2_v3': 6.25}]
[{'divergence_v1_v2': 34.657352447509766, 'divergence_v1_v3': 12.046551704406738, 'divergence_v2_v3': 8.66433334350586}]
[{'inner_v1_v2': 1.0, 'inner_v1_v3': -24.0, 'inner_v2_v3': 1.0}]
[{'cosine_v1_v2': 1.0, 'cosine_v1_v3': 0.0, 'cosine_v2_v3': 1.0}]
db.get_store()
if 'store' in db.t: print('store is created')
print('detected fts table: ',db.t.store.detect_fts())
print('Search results:', len(db.search('h',np.zeros((100,)).tobytes()))) # there is no data yet, so should be 0

store is created
detected fts table: store_fts
Search results: 0
We can also create a store with hash index on content. Useful for code search applications
st=db.get_store(name='my_store', hash=True)
st.insert_all([dict(content='hello world', embedding=np.ones((100,),dtype=np.float16).tobytes()),
dict(content='hi there', embedding=np.full((100,),0.5,dtype=np.float16).tobytes()),
dict(content='goodbye now', embedding=np.zeros((100,),dtype=np.float16).tobytes())],upsert=True,hash_id='id')
st(select='id,content')

[{'id': '250ce2bffa97ab21fa9ab2922d19993454a0cf28', 'content': 'hello world'},
{'id': 'c89f43361891bfab9290bcebf182fa5978f89700', 'content': 'hi there'},
{'id': '882293d5e5c3d3e04e8e0c4f7c01efba904d0932', 'content': 'goodbye now'}]
Let’s run a search again.
db.search(q='hello', emb=np.full((100,),0.25, dtype=np.float16).tobytes(), columns=['content'], table_name='my_store',limit=2, rrf=False, quote=True)

{'fts': [{'content': 'hello world', 'rank': -0.5108256237659907}],
'vec': [{'content': 'hello world', '_dist': 0.0},
{'content': 'hi there', '_dist': 0.0}]}
Now, let’s try the same but with a broader query.
db.search(q='goodbye OR hi', emb=np.full((100,),0,dtype=np.float16).tobytes(), columns=['content'], table_name='my_store',limit=2, quote=True)

[{'rowid': 3,
'content': 'goodbye now',
'_dist': 0.0,
'_rrf_score': 0.016666666666666666},
{'rowid': 1,
'content': 'hello world',
'_dist': 1.0,
'_rrf_score': 0.01639344262295082}]
You can use different kind of embedding metrics as well. The default is cosine. Let’s try with divergence distance
db.search(q='goodbye OR hi', emb=np.full((100,),0,dtype=np.float16).tobytes(), columns=['content'], table_name='my_store',limit=2, emb_metric='divergence', quote=True)

[{'rowid': 3,
'content': 'goodbye now',
'_dist': 0.0,
'_rrf_score': 0.016666666666666666},
{'rowid': 2,
'content': 'hi there',
'_dist': 17.328672409057617,
'_rrf_score': 0.01639344262295082}]