utils

encoding utilities for onnx based text encoders

Let’s load some default models that work well out of the box for various tasks

FastEncode is an ONNX-based embedding model wrapper that works with most ONNX models that ship a Hugging Face tokenizer. (The Qwen models are a bit tricky due to their padding-token handling, so they need a custom wrapper which we will add later.)

/opt/hostedtoolcache/Python/3.12.13/x64/lib/python3.12/site-packages/usearch/__init__.py:131: UserWarning: Will download `usearch_sqlite` binary from GitHub.
  warnings.warn("Will download `usearch_sqlite` binary from GitHub.", UserWarning)

source

download_model


def download_model(
    repo_id:str='onnx-community/embeddinggemma-300m-ONNX', # HF repo id
    md:str='onnx-community/embeddinggemma-300m-ONNX', # local cache dir
    filename:NoneType=None, # file path within repo; if None, downloads full repo snapshot
    token:NoneType=None, # HF token. you can also set HF_TOKEN env variable
):

Download model (or single file) from HF hub. Returns local path. Skips download if already cached.


source

FastEncode


def FastEncode(
    model_dict:AttrDict={'model': 'onnx-community/embeddinggemma-300m-ONNX', 'onnx_path': 'onnx/model.onnx', 'prompt': {'document': 'Instruct: document \n document: {text}', 'query': 'Instruct: query \n query: {text}'}}, # model dict with model repo, onnx path and prompt templates
    repo_id:NoneType=None, # model repo on HF. needs to have onnx model file
    md:NoneType=None, # local model dir
    md_nm:NoneType=None, # onnx model file name
    normalize:bool=True, # normalize embeddings
    dtype:type=float16, # output dtype
    tti:bool=False, # use token type ids (overridden by model_dict.tti if set)
    prompt:NoneType=None, # prompt templates
    hf_token:NoneType=None, # HF token. you can also set HF_TOKEN env variable
    batch_size:int=32, # texts per ONNX call
    parallel:int=0, # thread workers for parallel encoding (0=single-threaded)
    quantize:NoneType=None, # quantize weights: None, 'int8', or 'uint8'
    max_seq_len:NoneType=None, # override tokenizer max length (e.g. 64 for SigLIP2)
):

Fast ONNX-based text encoder with batching, parallel execution, and quantization.

Image Encoding

FastEncodeImage is a CLIP-style image encoder using the same ONNX approach as FastEncode. It accepts PIL Images, file paths, or bytes, and shares the same batch_size, parallel, and stream API.


source

FastEncodeImage


def FastEncodeImage(
    model_dict:AttrDict={'model': 'nomic-ai/nomic-embed-vision-v1.5', 'onnx_path': 'onnx/model.onnx', 'img_size': 224, 'mean': [0.485, 0.456, 0.406], 'std': [0.229, 0.224, 0.225]}, # model dict with model repo and onnx path
    normalize:bool=True, # normalize embeddings
    dtype:type=float16, # output dtype
    batch_size:int=16, # images per ONNX call
    parallel:int=0, # thread workers for parallel encoding (0=single-threaded)
    hf_token:NoneType=None, # HF token. you can also set HF_TOKEN env variable
):

Fast ONNX-based image encoder for CLIP-style models.

Unified Multimodal Encoding

FastEncodeMultimodal wraps a single model repo that ships both a text and a vision ONNX encoder (e.g. SigLIP2). Both encoders share the same embedding space, so text and image embeddings can be compared directly.

For paired models like nomic_text_v15 + nomic_vision_v15, use FastEncode and FastEncodeImage separately — their embeddings live in the same 768-dim space, so text and image vectors can be compared directly for cross-modal search.


source

FastEncodeMultimodal


def FastEncodeMultimodal(
    model_dict:AttrDict={'model': 'onnx-community/siglip2-so400m-patch16-512-ONNX', 'vision_onnx': 'onnx/vision_model.onnx', 'text_onnx': 'onnx/text_model.onnx', 'img_size': 512, 'mean': [0.5, 0.5, 0.5], 'std': [0.5, 0.5, 0.5], 'max_seq_len': 64}, # model dict with vision_onnx and text_onnx paths
    normalize:bool=True, # normalize embeddings
    dtype:type=float16, # output dtype
    batch_size_text:int=32, # texts per ONNX call
    batch_size_image:int=16, # images per ONNX call
    parallel:int=0, # thread workers (0=single-threaded)
    hf_token:NoneType=None, # HF token. you can also set HF_TOKEN env variable
):

Unified ONNX encoder for models with both text and vision encoders in a single repo (e.g. SigLIP2).

from PIL import Image
import base64, io, os
from IPython.display import display
# Create 4 synthetic RGB images without needing any files
test_imgs = [Image.fromarray(np.random.randint(0, 255, (224, 224, 3), dtype=np.uint8)) for _ in range(4)]
img_enc = FastEncodeImage()  # default: nomic_vision_v15 (768-dim)

# Basic embed
embs = img_enc.embed(test_imgs)
assert embs.shape == (4, 768), f'Expected (4, 768), got {embs.shape}'
assert embs.dtype == np.float16
print(f'Basic image embed: shape={embs.shape}, dtype={embs.dtype}')

# Batching — batch_size=2 vs batch_size=4 must match
embs_b2 = img_enc.embed(test_imgs, batch_size=2)
embs_b4 = img_enc.embed(test_imgs, batch_size=4)
assert embs_b2.shape == embs_b4.shape == (4, 768)
assert np.allclose(embs_b2.astype(np.float32), embs_b4.astype(np.float32), atol=1e-3), 'image batching results mismatch'
print('Image batching test passed')

# Stream
stream_res = img_enc.embed(test_imgs, batch_size=2, stream=True)
assert hasattr(stream_res, '__iter__')
assert np.concatenate(list(stream_res)).shape == (4, 768)
print('Image stream test passed')

# Parallel
embs_par = img_enc.embed(test_imgs, batch_size=2, parallel=2)
assert np.allclose(embs_b2.astype(np.float32), embs_par.astype(np.float32), atol=1e-3), 'image parallel results mismatch'
print('Image parallel test passed')
Basic image embed: shape=(4, 768), dtype=float16
Image batching test passed
Image stream test passed
Image parallel test passed
# nomic paired — text (FastEncode) + vision (FastEncodeImage) share the same 768-dim space
nomic_enc     = FastEncode(nomic_text_v15)
nomic_img_enc = FastEncodeImage(nomic_vision_v15)

t_emb = nomic_enc.encode(['What does the Eiffel Tower look like?'])
i_emb = nomic_img_enc.embed([Image.fromarray(np.random.randint(0, 255, (224, 224, 3), dtype=np.uint8))])

assert t_emb.shape == (1, 768), f'Expected (1, 768), got {t_emb.shape}'
assert i_emb.shape == (1, 768), f'Expected (1, 768), got {i_emb.shape}'
assert t_emb.dtype == i_emb.dtype == np.float16
print(f'nomic paired: text={t_emb.shape}, image={i_emb.shape} — same space, ready for cross-modal search')
nomic paired: text=(1, 768), image=(1, 768) — same space, ready for cross-modal search

PDF RAG Helpers

encode_pdf_texts and encode_pdf_images are the stable primitives — they encode and yield raw (page, chunk_idx, text, emb) / (page, img_bytes, emb) tuples; the caller owns the storage schema entirely.


source

encode_pdf_images


def encode_pdf_images(
    doc, # PdfDocument
    enc, # FastEncodeImage instance
)->L:

Extract and encode every PDF image; yields (page, img_bytes, emb) — caller owns the storage schema.


source

encode_pdf_texts


def encode_pdf_texts(
    doc, # PdfDocument
    enc, # FastEncode instance
    chunk_fn:NoneType=None, # doc -> [(pg,ci,text)]; default: doc.pdf_chunks
    kw:VAR_KEYWORD
)->L: # forwarded to doc.pdf_chunks (min_len, st, end)

Encode PDF text chunks; yields (page, chunk_idx, text, emb) — caller owns the storage schema.

enc = FastEncode()
embs = enc.encode_document(['This is a test', 'Another test'])
assert embs.shape == (2, 768), f'Expected (2, 768), got {embs.shape}'
assert embs.dtype == np.float16
print(f'Basic encode: shape={embs.shape}, dtype={embs.dtype}')
embs
Basic encode: shape=(2, 768), dtype=float16
array([[ 0.05777 ,  0.001723,  0.002573, ..., -0.0618  , -0.00662 ,
         0.03174 ],
       [ 0.02936 , -0.00818 , -0.00916 , ..., -0.02847 , -0.00226 ,
         0.02846 ]], shape=(2, 768), dtype=float16)
qg = embedding_gemma
qg['onnx_path'] = 'onnx/model_q4.onnx'  # switch to the int4-quantized model file
enc_q = FastEncode(qg)                  # pass the configured dict (was: FastEncode())
embs = enc_q.encode_document(['This is a test', 'Another test'])  # use enc_q (was: enc)
assert embs.shape == (2, 768), f'Expected (2, 768), got {embs.shape}'
assert embs.dtype == np.float16
print(f'Basic encode: shape={embs.shape}, dtype={embs.dtype}')
embs
Basic encode: shape=(2, 768), dtype=float16
array([[ 0.05777 ,  0.001723,  0.002573, ..., -0.0618  , -0.00662 ,
         0.03174 ],
       [ 0.02936 , -0.00818 , -0.00916 , ..., -0.02847 , -0.00226 ,
         0.02846 ]], shape=(2, 768), dtype=float16)
# Batching — 30 texts in batch_size=8 vs batch_size=100 must produce identical results
texts = [f'sample text number {i}' for i in range(30)]
embs_b8   = enc.encode_document(texts, batch_size=8)
embs_b100 = enc.encode_document(texts, batch_size=100)
assert embs_b8.shape == embs_b100.shape == (30, 768)
assert np.allclose(embs_b8.astype(np.float32), embs_b100.astype(np.float32), atol=1e-3), 'batching results mismatch'
print(f'Batching test passed: {embs_b8.shape}')
Batching test passed: (30, 768)
# Stream — stream=True returns an iterable of batch arrays rather than one concatenated array
stream_results = enc.encode_document(texts, batch_size=10, stream=True)
assert hasattr(stream_results, '__iter__'), 'stream should be iterable'
concatenated = np.concatenate(list(stream_results))
assert concatenated.shape == (30, 768)
print('Stream test passed')

# Parallel — 4 thread workers should give same embeddings as single-threaded
embs_par = enc.encode_document(texts, batch_size=8, parallel=4)
assert np.allclose(embs_b8.astype(np.float32), embs_par.astype(np.float32), atol=1e-3), 'parallel results mismatch'
print('Parallel test passed')
Stream test passed
Parallel test passed
# Semantic QA — modernbert should rank the transformer/attention passage highest
modern_enc = FastEncode(modernbert)
passages = [
    'The Transformer is a model architecture using self-attention mechanisms.',
    'Recurrent neural networks process sequences step by step.',
    'The model uses encoder and decoder stacks connected by attention.',
    'Convolutional networks excel at image recognition tasks.',
]
doc_embs = modern_enc.encode_document(passages).astype(np.float32)
query = 'What architecture replaces recurrence with self-attention?'
q_emb = modern_enc.encode_query([query]).astype(np.float32)
sims = (doc_embs @ q_emb.T).flatten()
best = int(sims.argmax())
assert sims[best] > 0.3, f'Expected similarity > 0.3, got {sims[best]:.3f}'
assert any(w in passages[best].lower() for w in ['attention', 'transformer', 'encoder'])
print(f'Semantic QA test passed: best match sim={sims[best]:.3f}')
print(f'Best: {passages[best]}')
Semantic QA test passed: best match sim=0.527
Best: The Transformer is a model architecture using self-attention mechanisms.