Utilities to aid data extraction and query preprocessing.
We will build a simple ingestion pipeline that ingests PDF documents into a litesearch database for searching.
Extensions to pdf_oxide's PdfDocument for extracting text, images, links, tables, spans, and the outline.
/opt/hostedtoolcache/Python/3.12.13/x64/lib/python3.12/site-packages/usearch/__init__.py:131: UserWarning: Will download `usearch_sqlite` binary from GitHub.
warnings.warn("Will download `usearch_sqlite` binary from GitHub.", UserWarning)
def chunk_markdown( text:str, # markdown text (e.g. from pdf_markdown()))->L:
Split markdown into paragraph chunks on blank lines; drop short fragments.
# Smoke test for PdfDocument.pdf_chunks(): runs only when the sample PDF is
# found at one of the two candidate relative paths; otherwise it is skipped.
_pdf2 = next((p for p in ['pdfs/attention_is_all_you_need.pdf', 'nbs/pdfs/attention_is_all_you_need.pdf'] if Path(p).exists()), None)
if _pdf2:
    _doc = PdfDocument(_pdf2)
    _chunks = _doc.pdf_chunks()
    assert len(_chunks) > 0
    # Every chunk is expected to be a 3-tuple whose last item is the chunk text.
    assert all(len(t) == 3 for t in _chunks), 'each chunk must be (page, chunk_idx, text)'
    assert all(len(t[2]) >= 40 for t in _chunks), 'short fragments should be filtered'
    print(f'pdf_chunks: {len(_chunks)} chunks from {_doc.page_count()} pages (sample: {repr(_chunks[0][2][:60])})...')
else:
    print('Skipping chunk test — PDF not found')
/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
from .autonotebook import tqdm as notebook_tqdm
pdf_chunks: 30 chunks from 15 pages (sample: 'Provided proper attribution is provided, Google hereby grant')...
def file_parse( p:Path=None, # path to a code file code:str=None, # code string to parse imports:bool=False, # include import statements as code chunks assigns:bool=False, # include top-level assignments as code chunks)->L: # additional args to pass to pyparse
Parse a code file or code string and return code chunks with metadata.
def pyparse( p:Path=None, # path to a python file code:str=None, # code string to parse imports:bool=False, # include import statements as code chunks assigns:bool=False, # include top-level assignments as code chunks)->L:
Parse a code string or python file and return code chunks as list of dicts with content and metadata.
def pkg2files( pkg:str, # package name skip_file_re:str='^[.]|^(?:setup\\.py|conftest\\.py)$', # regex to skip files skip_folder_re:str='^[.]|^(?:tests?|examples?|docs?|build|dist)$', # regex to skip folders func:type=Path, # function to apply to file paths (e.g. Path or str) path:pathlib.Path |str='.', # path to start searching recursive:bool=True, # search subfolders maxdepth:int=None, # max depth to descend (1=just immediate contents; None=unlimited) symlinks:bool=True, # follow symlinks? file_glob:str=None, # Only include files matching glob file_re:str=None, # Only include files matching regex folder_re:str=None, # Only enter folders matching regex skip_file_glob:str=None, # Skip files matching glob ret_folders:bool=False, # return folders, not just files sort:bool=True, # sort files by name within each folder types:str|list=None, # list or comma-separated str of ext types from: py, js, java, c, cpp, rb, r, ex, sh, web, doc, cfg exts:str|list=None, # list or comma-separated str of exts to include)->L: # additional args to pass to globtastic
Return list of python files in a package excluding tests and setup files.
def dir2files(dir:str, # package name skip_file_re:str='^[.]|^(?:setup\\.py|conftest\\.py)$', # regex to skip files skip_folder_re:str='^[.]|^(?:tests?|examples?|docs?|build|dist)$', # regex to skip folders func:type=Path, # function to apply to file paths (e.g. Path or str) path:pathlib.Path |str='.', # path to start searching recursive:bool=True, # search subfolders maxdepth:int=None, # max depth to descend (1=just immediate contents; None=unlimited) symlinks:bool=True, # follow symlinks? file_glob:str=None, # Only include files matching glob file_re:str=None, # Only include files matching regex folder_re:str=None, # Only enter folders matching regex skip_file_glob:str=None, # Skip files matching glob ret_folders:bool=False, # return folders, not just files sort:bool=True, # sort files by name within each folder types:str|list=None, # list or comma-separated str of ext types from: py, js, java, c, cpp, rb, r, ex, sh, web, doc, cfg exts:str|list=None, # list or comma-separated str of exts to include)->L: # additional args to pass to globtastic
Return list of python files in a directory excluding tests and setup files.
def dir2chunks(dir:str, # directory path imports:bool=False, # include import statements as code chunks assigns:bool=False, # include top-level assignments as code chunks skip_file_re:str='^[.]|^(?:setup\\.py|conftest\\.py)$', # regex to skip files skip_folder_re:str='^[.]|^(?:tests?|examples?|docs?|build|dist)$', # regex to skip folders func:type=Path, # function to apply to file paths (e.g. Path or str) path:pathlib.Path |str='.', # path to start searching recursive:bool=True, # search subfolders maxdepth:int=None, # max depth to descend (1=just immediate contents; None=unlimited) symlinks:bool=True, # follow symlinks? file_glob:str=None, # Only include files matching glob file_re:str=None, # Only include files matching regex folder_re:str=None, # Only enter folders matching regex skip_file_glob:str=None, # Skip files matching glob ret_folders:bool=False, # return folders, not just files sort:bool=True, # sort files by name within each folder types:str|list=None, # list or comma-separated str of ext types from: py, js, java, c, cpp, rb, r, ex, sh, web, doc, cfg exts:str|list=None, # list or comma-separated str of exts to include)->L: # additional args to pass to dir2files
Return code chunks from a directory with extra metadata.
def pkg2chunks( pkg:str, # package name imports:bool=False, # include import statements as code chunks kw:VAR_KEYWORD)->L: # additional args to pass to pkg2files
Return code chunks from a package with extra metadata.
pkg2chunks can be used to extract code chunks from an entire package installed in your environment.
def installed_packages( nms:list=None, # list of package names pyproject:bool=False, # restrict to pyproject.toml dependencies xtras:str|None=None, # include extra groups as csv from pyproject.toml)->L:
Return list of installed packages. If nms is provided, return only those packages.
Get list of installed packages in your environment using installed_packages. If you pass a list of package names, it only returns them if they exist in your environment.
installed_packages(['fstlite'], pyproject=True, xtras='foo') # non-existent package
def pre( q:str, # query to be passed for fts search wc:bool=True, # add wild card to each word wide:bool=True, # widen the query with OR operator extract_kw:bool=True, # extract keywords from the query pattern:str='[*,"\\(\\)\\^_]|-(?=\\S)', # regex pattern to use to replace with space):
def clean( q:str, # query to be passed for fts search pattern:str='[*,"\\(\\)\\^_]|-(?=\\S)', # regex pattern to use to replace with space):
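Clean the query by replacing every match of `pattern` (by default `*`, `,`, `"`, `(`, `)`, `^`, `_`, and any `-` immediately followed by a non-space character) with a space, returning None for empty queries.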
You can clean queries passed to FTS search using `clean`, add wildcards using `add_wc`, widen the query using `mk_wider`, and extract keywords using `kw`. The `pre` function combines all of these steps.
# Demonstrate how each `pre` option changes the same sample query.
q = 'This is a sample query'
print('preprocessed q with defaults: `%s`' % pre(q))
print('keywords extracted: `%s`' % pre(q, wc=False, wide=False))
print('q with wild card: `%s`' % pre(q, extract_kw=False, wide=False, wc=True))
# Underscores are part of the default `pattern`, so they are replaced with spaces.
print('remove _ : `%s`' % pre('This_is_a_query_with_underscores with_uv', extract_kw=False))
img2png normalises any image input (PIL, bytes, or path) to RGB PNG bytes. png_det parses the PNG header and IDAT stream. images_to_pdf wraps a list of images into a conformant multi-page image-only PDF without any external dependencies.
Note: pdf_oxide.Pdf.from_image* methods are broken in 0.3.x (produce an empty 1.3 KB shell). These functions build a conformant PDF directly from the PNG spec using FlateDecode + Predictor 15.
Normalise image input (PIL Image / bytes / path) to PNG bytes as RGB.
import tempfile, os
# find PDF relative to nbs/ (normal Jupyter) or project root (nbdev_test)
_pdf = next((p for p in ['pdfs/attention_is_all_you_need.pdf', 'nbs/pdfs/attention_is_all_you_need.pdf'] if Path(p).exists()), None)
assert _pdf, 'attention_is_all_you_need.pdf not found'
arxiv = PdfDocument(_pdf)

# pdf_texts and pdf_markdown
md_pages = arxiv.pdf_markdown()
assert len(md_pages) == arxiv.page_count(), f'Expected {arxiv.page_count()} pages, got {len(md_pages)}'
assert any('attention' in p.lower() for p in md_pages)
txt_pages = arxiv.pdf_texts()
assert len(txt_pages) == arxiv.page_count()
assert any('attention' in t.lower() for t in txt_pages)
links = arxiv.pdf_links()
assert isinstance(links, L)
print(f'pdf_texts: {len(txt_pages)} pages, pdf_markdown: {len(md_pages)} pages, pdf_links: {len(links)} links')

# images_to_pdf: build synthetic scanned PDF from a real page image
with tempfile.TemporaryDirectory() as tmp:
    arxiv.to_markdown(2, include_images=True, embed_images=False, image_output_dir=tmp)
    # Read the first image written to the temp dir; the `with` closes the
    # handle before the directory is removed.
    with open(os.path.join(tmp, os.listdir(tmp)[0]), 'rb') as _img_file:
        img_bytes = _img_file.read()
_scanned = Path(_pdf).parent/'scanned_test.pdf'
images_to_pdf([img_bytes], output=str(_scanned))
scanned = PdfDocument(str(_scanned))
assert scanned.extract_text(0).strip() == '', 'scanned PDF should have no native text'
assert _scanned.stat().st_size > 100_000, 'scanned PDF should contain the embedded image'
print('images_to_pdf test passed:', _scanned)
Copy bundled SKILL.md to skill directories (.agents/, .claude/, .Codex/).
import tempfile
import importlib.util as _iu
from pathlib import Path as _Path

# Locate the installed litesearch package — works regardless of cwd
_spec = _iu.find_spec('litesearch')
# find_spec returns None when the package cannot be found; fail with a clear message.
assert _spec is not None, 'litesearch package not found'
skill_src = _Path(_spec.submodule_search_locations[0]) / 'SKILL.md'
assert skill_src.exists(), f'SKILL.md not found at {skill_src.resolve()}'

with tempfile.TemporaryDirectory() as tmp:
    tmp = _Path(tmp)
    # Write a copy of SKILL.md under each skill directory inside the temp dir.
    for d in ['.agents', '.claude', '.Codex']:
        dest = tmp / d / 'skills' / 'litesearch' / 'SKILL.md'
        dest.parent.mkdir(parents=True, exist_ok=True)
        dest.write_text(skill_src.read_text())
    # Verify every copy exists and contains the package name.
    for d in ['.agents', '.claude', '.Codex']:
        dest = tmp / d / 'skills' / 'litesearch' / 'SKILL.md'
        assert dest.exists(), f'Missing: {dest}'
        assert 'litesearch' in dest.read_text()
print('mv_skill_md install paths ok')