data

Utilities to aid data extraction and query preprocessing.

We will build a simple ingestion pipeline that ingests PDF documents into a litesearch database for searching.

Extensions to pdf_oxide's PdfDocument for extracting texts, images, links, tables, spans and the outline.

/opt/hostedtoolcache/Python/3.12.13/x64/lib/python3.12/site-packages/usearch/__init__.py:131: UserWarning: Will download `usearch_sqlite` binary from GitHub.
  warnings.warn("Will download `usearch_sqlite` binary from GitHub.", UserWarning)

source

PdfDocument.pdf_spans


def pdf_spans(
    doc:PdfDocument, st:int=0, end:NoneType=None
)->L:

Extract text spans with font metadata (size, weight, bbox) per page.


source

PdfDocument.pdf_tables


def pdf_tables(
    doc:PdfDocument, st:int=0, end:NoneType=None
)->L:

Extract structured tables (rows/cells/bbox) from each page.


source

PdfDocument.pdf_markdown


def pdf_markdown(
    doc:PdfDocument, st:int=0, end:NoneType=None
)->L:

Convert pages to markdown with headings and tables detected.


source

PdfDocument.pdf_images


def pdf_images(
    doc:PdfDocument, st:int=0, end:NoneType=None, output_dir:NoneType=None
)->L:

Extract images — returns metadata dicts, or saves to output_dir if provided.


source

PdfDocument.pdf_texts


def pdf_texts(
    doc:PdfDocument, st:int=0, end:NoneType=None
)->L:

Extract plain text from each page.


source

PdfDocument.pdf_chunks


def pdf_chunks(
    st:int=0, # start page
    end:NoneType=None, # end page (exclusive)
)->L:

Markdown-chunked text from each page; yields (page, chunk_idx, text) tuples.


source

chunk_markdown


def chunk_markdown(
    text:str, # markdown text (e.g. from pdf_markdown())
)->L:

Split markdown into paragraph chunks on blank lines; drop short fragments.

# Probe both known locations for the sample PDF; the first one that exists wins, otherwise None.
_pdf2 = next(
    filter(
        lambda p: Path(p).exists(),
        ['pdfs/attention_is_all_you_need.pdf', 'nbs/pdfs/attention_is_all_you_need.pdf'],
    ),
    None,
)
if not _pdf2:
    print('Skipping chunk test — PDF not found')
else:
    _doc = PdfDocument(_pdf2)
    _chunks = _doc.pdf_chunks()
    # Sanity checks: a non-empty result, 3-item entries, and text of at least 40 characters.
    assert len(_chunks) > 0
    assert all(len(c) == 3 for c in _chunks), 'each chunk must be (page, chunk_idx, text)'
    assert all(len(c[2]) >= 40 for c in _chunks), 'short fragments should be filtered'
    print(f'pdf_chunks: {len(_chunks)} chunks from {_doc.page_count()} pages (sample: {repr(_chunks[0][2][:60])})...')
/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
pdf_chunks: 30 chunks from 15 pages (sample: 'Provided proper attribution is provided, Google hereby grant')...

Code extraction utilities


source

file_parse


def file_parse(
    p:Path=None, # path to a code file
    code:str=None, # code string to parse
    imports:bool=False, # include import statements as code chunks
    assigns:bool=False, # include top-level assignments as code chunks
)->L: # additional args to pass to pyparse

Parse a code file or code string and return code chunks with metadata.


source

chunk_texts


def chunk_texts(
    text:str
):

Chunk texts using Fast Chunker


source

non_py_sigs


def non_py_sigs(
    p
):

Extract signatures from files that are not Python code, using codesigs.


source

ipynb_parse


def ipynb_parse(
    p
):

Parse ipynb files


source

pyparse


def pyparse(
    p:Path=None, # path to a python file
    code:str=None, # code string to parse
    imports:bool=False, # include import statements as code chunks
    assigns:bool=False, # include top-level assignments as code chunks
)->L:

Parse a code string or python file and return code chunks as list of dicts with content and metadata.


source

spec


def spec(
    pkg:str
)->_frozen_importlib.ModuleSpec | None:

Return the importlib ModuleSpec for a package, or None if not found.


source

repo_root


def repo_root(
    
)->Path:

Find the root of the current git repository, or None if not in a repo.

ipynb_parse(Path('01_core.ipynb'))[0]
{'content': '---\ndescription: Building blocks for litesearch\noutput-file: core.html\ntitle: core\n\n---\n\n',
 'metadata': {'path': '01_core.ipynb',
  'uploaded_at': 1777421728.1014895,
  'lang': '.ipynb',
  'type': 'raw'}}

You can use pyparse to extract code chunks from a python file or code string.

# Sample source containing a star import, a top-level assignment and a class;
# it is reused by the pyparse examples that follow.
txt = """
from fastcore.all import *
a=1
class SomeClass:
    def __init__(self,x): store_attr()
    def method(self): return self.x + a
 """
# With the default flags (imports=False, assigns=False) only the class comes back as a chunk.
pyparse(code=txt)
[{'content': 'class SomeClass:\n    def __init__(self,x): store_attr()\n    def method(self): return self.x + a', 'metadata': {'path': 'None', 'uploaded_at': None, 'name': 'SomeClass', 'lang': '.py', 'type': 'ClassDef', 'lineno': 4, 'end_lineno': 6}}]

Setting imports to True will also include import statements as code chunks.

pyparse(code=txt, imports=True)
[{'content': 'from fastcore.all import *', 'metadata': {'path': 'None', 'uploaded_at': None, 'name': None, 'lang': '.py', 'type': 'ImportFrom', 'lineno': 2, 'end_lineno': 2}}, {'content': 'class SomeClass:\n    def __init__(self,x): store_attr()\n    def method(self): return self.x + a', 'metadata': {'path': 'None', 'uploaded_at': None, 'name': 'SomeClass', 'lang': '.py', 'type': 'ClassDef', 'lineno': 4, 'end_lineno': 6}}]
pyparse(code=txt, assigns=True)
[{'content': 'a=1', 'metadata': {'path': 'None', 'uploaded_at': None, 'name': None, 'lang': '.py', 'type': 'Assign', 'lineno': 3, 'end_lineno': 3}}, {'content': 'class SomeClass:\n    def __init__(self,x): store_attr()\n    def method(self): return self.x + a', 'metadata': {'path': 'None', 'uploaded_at': None, 'name': 'SomeClass', 'lang': '.py', 'type': 'ClassDef', 'lineno': 4, 'end_lineno': 6}}]
def parse_js():
    """Write a small JavaScript sample to a temporary `.js` file and print what `file_parse` returns for it."""
    import tempfile
    import textwrap

    sample = textwrap.dedent("""
        const htmlElement = document.documentElement;
        const mediaQuery = window.matchMedia('(prefers-color-scheme: dark)');
        const vr = '__VR__';
        let __VR__ = JSON.parse(localStorage.getItem(vr) || '{{__state__}}');
        function storeState(key, value) {__VR__[key] = value; localStorage.setItem(vr, JSON.stringify(__VR__));}
        function getState(key){return __VR__[key];}
        function removeState(key) {delete  __VR__[key]; localStorage.setItem(key, JSON.stringify(__VR__)); }
        function setTheme(color,fn=null, ...args) {
            if (color === null || color === undefined) {return;}
            htmlElement.classList.remove(__VR__.theme);
            htmlElement.classList.add(color);
            storeState('theme', color);
            if (typeof fn === 'function') {fn(...args);}
        }
    """)

    # The temporary file is deleted when the `with` block exits, so parse it while it is still open.
    with tempfile.NamedTemporaryFile('w+', suffix='.js', encoding='utf-8') as tmp_js:
        tmp_js.write(sample)
        tmp_js.flush()  # push buffered text to disk so file_parse can read it via the path
        parsed = file_parse(Path(tmp_js.name))
        print(parsed)

parse_js()
[{'content': 'function storeState(key, value) {...}', 'metadata': {'path': '/tmp/tmp2ov5wkwr.js', 'uploaded_at': 1778203522.4120872, 'lang': '.js'}}, {'content': 'function getState(key) {...}', 'metadata': {'path': '/tmp/tmp2ov5wkwr.js', 'uploaded_at': 1778203522.4120872, 'lang': '.js'}}, {'content': 'function removeState(key) {...}', 'metadata': {'path': '/tmp/tmp2ov5wkwr.js', 'uploaded_at': 1778203522.4120872, 'lang': '.js'}}, {'content': 'function setTheme(color,fn=null, ...args) {...}', 'metadata': {'path': '/tmp/tmp2ov5wkwr.js', 'uploaded_at': 1778203522.4120872, 'lang': '.js'}}]

source

pkg2files


def pkg2files(
    pkg:str, # package name
    skip_file_re:str='^[.]|^(?:setup\\.py|conftest\\.py)$', # regex to skip files
    skip_folder_re:str='^[.]|^(?:tests?|examples?|docs?|build|dist)$', # regex to skip folders
    func:type=Path, # function to apply to file paths (e.g. Path or str)
    path:pathlib.Path | str='.', # path to start searching
    recursive:bool=True, # search subfolders
    maxdepth:int=None, # max depth to descend (1=just immediate contents; None=unlimited)
    symlinks:bool=True, # follow symlinks?
    file_glob:str=None, # Only include files matching glob
    file_re:str=None, # Only include files matching regex
    folder_re:str=None, # Only enter folders matching regex
    skip_file_glob:str=None, # Skip files matching glob
    ret_folders:bool=False, # return folders, not just files
    sort:bool=True, # sort files by name within each folder
    types:str | list=None, # list or comma-separated str of ext types from: py, js, java, c, cpp, rb, r, ex, sh, web, doc, cfg
    exts:str | list=None, # list or comma-separated str of exts to include
)->L: # additional args to pass to globtastic

Return list of python files in a package excluding tests and setup files.

pkg2files('fastlite', exts=code_exts)
[Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/fastlite/__init__.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/fastlite/_modidx.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/fastlite/core.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/fastlite/kw.py')]
pkg2files('protobuf')
[Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/_upb/_message.abi3.so'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/__init__.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/any.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/any_pb2.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/api_pb2.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/compiler/__init__.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/compiler/plugin_pb2.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/descriptor.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/descriptor_database.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/descriptor_pb2.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/descriptor_pool.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/duration.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/duration_pb2.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/empty_pb2.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/field_mask_pb2.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/internal/__init__.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/internal/api_implementation.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/internal/builder.py'), 
Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/internal/containers.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/internal/decoder.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/internal/encoder.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/internal/enum_type_wrapper.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/internal/extension_dict.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/internal/field_mask.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/internal/message_listener.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/internal/python_edition_defaults.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/internal/python_message.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/internal/testing_refleaks.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/internal/type_checkers.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/internal/well_known_types.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/internal/wire_format.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/json_format.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/message.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/message_factory.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/proto.py'), 
Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/proto_builder.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/proto_json.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/proto_text.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/pyext/__init__.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/pyext/cpp_message.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/reflection.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/runtime_version.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/service_reflection.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/source_context_pb2.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/struct_pb2.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/symbol_database.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/testdata/__init__.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/text_encoding.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/text_format.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/timestamp.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/timestamp_pb2.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/type_pb2.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/unknown_fields.py'), 
Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/util/__init__.py'), Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/google/protobuf/wrappers_pb2.py')]

source

dir2files


def dir2files(
    dir:str, # directory path
    skip_file_re:str='^[.]|^(?:setup\\.py|conftest\\.py)$', # regex to skip files
    skip_folder_re:str='^[.]|^(?:tests?|examples?|docs?|build|dist)$', # regex to skip folders
    func:type=Path, # function to apply to file paths (e.g. Path or str)
    path:pathlib.Path | str='.', # path to start searching
    recursive:bool=True, # search subfolders
    maxdepth:int=None, # max depth to descend (1=just immediate contents; None=unlimited)
    symlinks:bool=True, # follow symlinks?
    file_glob:str=None, # Only include files matching glob
    file_re:str=None, # Only include files matching regex
    folder_re:str=None, # Only enter folders matching regex
    skip_file_glob:str=None, # Skip files matching glob
    ret_folders:bool=False, # return folders, not just files
    sort:bool=True, # sort files by name within each folder
    types:str | list=None, # list or comma-separated str of ext types from: py, js, java, c, cpp, rb, r, ex, sh, web, doc, cfg
    exts:str | list=None, # list or comma-separated str of exts to include
)->L: # additional args to pass to globtastic

Return list of python files in a directory excluding tests and setup files.

dir2files(repo_root(), exts=code_exts.split(','), skip_file_glob='*.ipynb')
[Path('/Users/71293/code/litesearch/litesearch/__init__.py'), Path('/Users/71293/code/litesearch/litesearch/_modidx.py'), Path('/Users/71293/code/litesearch/litesearch/core.py'), Path('/Users/71293/code/litesearch/litesearch/data.py'), Path('/Users/71293/code/litesearch/litesearch/postfix.py'), Path('/Users/71293/code/litesearch/litesearch/utils.py')]

source

dir2chunks


def dir2chunks(
    dir:str, # directory path
    imports:bool=False, # include import statements as code chunks
    assigns:bool=False, # include top-level assignments as code chunks
    skip_file_re:str='^[.]|^(?:setup\\.py|conftest\\.py)$', # regex to skip files
    skip_folder_re:str='^[.]|^(?:tests?|examples?|docs?|build|dist)$', # regex to skip folders
    func:type=Path, # function to apply to file paths (e.g. Path or str)
    path:pathlib.Path | str='.', # path to start searching
    recursive:bool=True, # search subfolders
    maxdepth:int=None, # max depth to descend (1=just immediate contents; None=unlimited)
    symlinks:bool=True, # follow symlinks?
    file_glob:str=None, # Only include files matching glob
    file_re:str=None, # Only include files matching regex
    folder_re:str=None, # Only enter folders matching regex
    skip_file_glob:str=None, # Skip files matching glob
    ret_folders:bool=False, # return folders, not just files
    sort:bool=True, # sort files by name within each folder
    types:str | list=None, # list or comma-separated str of ext types from: py, js, java, c, cpp, rb, r, ex, sh, web, doc, cfg
    exts:str | list=None, # list or comma-separated str of exts to include
)->L: # additional args to pass to dir2files

Return code chunks from a directory with extra metadata.


source

pkg2chunks


def pkg2chunks(
    pkg:str, # package name
    imports:bool=False, # include import statements as code chunks
    kw:VAR_KEYWORD
)->L: # additional args to pass to pkg2files

Return code chunks from a package with extra metadata.

pkg2chunks can be used to extract code chunks from an entire package installed in your environment.

pkg2chunks('fastlite').filter(lambda d: d['metadata']['type']=='FunctionDef')[0]
{'content': 'def t(self:Database): return _TablesGetter(self)',
 'metadata': {'path': '/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/fastlite/core.py',
  'uploaded_at': 1773452878.5692947,
  'name': 't',
  'lang': '.py',
  'type': 'FunctionDef',
  'lineno': 44,
  'end_lineno': 44,
  'package': 'fastlite',
  'version': '0.2.4'}}
pkg2files('pdf_oxide')[0]
Path('/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/pdf_oxide/__init__.py')
dir2chunks('../nbs', types='ipynb')[0]
{'content': '---\ndescription: Fixes for post import issues\noutput-file: postfix.html\ntitle: postfix\n\n---\n\n',
 'metadata': {'path': '../nbs/00_postfix.ipynb',
  'uploaded_at': 1773099020.1621501,
  'lang': '.ipynb',
  'type': 'raw',
  'dir': '../nbs'}}

source

installed_packages


def installed_packages(
    nms:list=None, # list of package names
    pyproject:bool=False, # restrict to pyproject.toml dependencies
    xtras:str | None=None, # include extra groups as csv from pyproject.toml
)->L:

Return list of installed packages. If nms is provided, return only those packages.

Get list of installed packages in your environment using installed_packages. If you pass a list of package names, it only returns them if they exist in your environment.

installed_packages(['fstlite'], pyproject=True, xtras='foo') # non existent package
['codesigs', 'notebook', 'chonkie', 'fastlite', 'pdf_oxide', 'usearch', 'pandas', 'onnxruntime', 'pillow', 'model2vec', 'yake', 'tokenizers']
installed_packages(['fastlite']) # existing package
['fastlite']
installed_packages()[:10] # all installed packages that are not stdlib
['apsw', 'threadpoolctl', 'apswutils', 'uri-template', 'requests', 'usearch', 'rfc3339-validator', 'pexpect', 'jupyterlab-quarto', 'fqdn']
installed_packages(pyproject=True) # all installed packages that are dependencies in pyproject.toml
['chonkie', 'codesigs', 'fastlite', 'model2vec', 'notebook', 'onnxruntime', 'pandas', 'pdf_oxide', 'pillow', 'tokenizers', 'usearch', 'yake', 'notebook', 'nbdev', 'jupyterlab', 'jupyterlab-quarto', 'selectolax', 'lisette', 'pyperclip', 'chonkie', 'FlashRank', 'model2vec', 'fastprogress', 'koshas']
installed_packages(pyproject=True, xtras='dev')
['chonkie', 'codesigs', 'fastlite', 'model2vec', 'notebook', 'onnxruntime', 'pandas', 'pdf_oxide', 'pillow', 'tokenizers', 'usearch', 'yake']

Query Preprocessing utilities


source

pre


def pre(
    q:str, # query to be passed for fts search
    wc:bool=True, # add wild card to each word
    wide:bool=True, # widen the query with OR operator
    extract_kw:bool=True, # extract keywords from the query
    pattern:str='[*,"\\(\\)\\^_]|-(?=\\S)', # regex pattern to use to replace with space
):

Preprocess the query for fts search.


source

kw


def kw(
    q:str, # query to be passed for fts search
):

Extract keywords from the query using YAKE library.


source

mk_wider


def mk_wider(
    q:str, # query to be passed for fts search
):

Widen the query by joining words with OR operator.


source

add_wc


def add_wc(
    q:str, # query to be passed for fts search
):

Add a wildcard (`*`) to each word in the query.


source

clean


def clean(
    q:str, # query to be passed for fts search
    pattern:str='[*,"\\(\\)\\^_]|-(?=\\S)', # regex pattern to use to replace with space
):

Clean the query by replacing characters that match `pattern` with spaces, returning None for empty queries.

You can clean queries passed to FTS search using clean, add wildcards using add_wc, widen the query using mk_wider, and extract keywords using kw. The pre function combines all of these steps.

# Demonstrate `pre` with different flag combinations on the same sample query.
q = 'This is a sample query'
# Defaults: keyword extraction, wildcards and OR-widening all enabled.
print('preprocessed q with defaults: `%s`' %pre(q))
# Keyword extraction only (wildcards and widening switched off).
print('keywords extracted: `%s`' %pre(q, wc=False, wide=False))
# Wildcards only (keyword extraction and widening switched off).
print('q with wild card: `%s`' %pre(q, extract_kw=False, wide=False, wc=True))
# Underscores match the default cleaning `pattern`, so they are replaced with spaces.
print('remove _ : `%s`' %pre('This_is_a_query_with_underscores with_uv', extract_kw=False))
preprocessed q with defaults: `query* OR sample*`
keywords extracted: `query sample`
q with wild card: `This* is* a* sample* query*`
remove _ : `This* is* a* query* with* underscores* with* uv*`

Image → PDF

img2png normalises any image input (PIL, bytes, or path) to RGB PNG bytes. png_det parses the PNG header and IDAT stream. images_to_pdf wraps a list of images into a conformant multi-page image-only PDF without any external dependencies.

Note: pdf_oxide.Pdf.from_image* methods are broken in 0.3.x (produce an empty 1.3 KB shell). These functions build a conformant PDF directly from the PNG spec using FlateDecode + Predictor 15.


source

images_to_pdf


def images_to_pdf(
    imgs, # list of PIL Images, bytes, or file paths
    output:NoneType=None, # path to save, or None to return bytes
)->bytes:

Convert a list of images into a multi-page image-only PDF (one image per page).


source

png_det


def png_det(
    png:bytes
):

Return (width, height, concatenated IDAT bytes) from a PNG.


source

img2png


def img2png(
    img
)->bytes:

Normalise image input (PIL Image / bytes / path) to PNG bytes as RGB.

import os
import tempfile

# Find the sample PDF relative to nbs/ (normal Jupyter) or the project root (nbdev_test).
_pdf = next((p for p in ['pdfs/attention_is_all_you_need.pdf', 'nbs/pdfs/attention_is_all_you_need.pdf'] if Path(p).exists()), None)
assert _pdf, 'attention_is_all_you_need.pdf not found'

arxiv = PdfDocument(_pdf)

# pdf_markdown: expect one entry per page, with the paper's topic word present somewhere.
md_pages = arxiv.pdf_markdown()
assert len(md_pages) == arxiv.page_count(), f'Expected {arxiv.page_count()} pages, got {len(md_pages)}'
assert any('attention' in p.lower() for p in md_pages)

# pdf_texts: same expectations for the plain-text extraction.
txt_pages = arxiv.pdf_texts()
assert len(txt_pages) == arxiv.page_count()
assert any('attention' in t.lower() for t in txt_pages)

# pdf_links: only the container type is checked here.
links = arxiv.pdf_links()
assert isinstance(links, L)
print(f'pdf_texts: {len(txt_pages)} pages, pdf_markdown: {len(md_pages)} pages, pdf_links: {len(links)} links')

# images_to_pdf: build a synthetic scanned PDF from a real page image
with tempfile.TemporaryDirectory() as tmp:
    arxiv.to_markdown(2, include_images=True, embed_images=False, image_output_dir=tmp)
    # read_bytes() opens and closes the file itself; a bare open(...).read() would
    # leave the handle open until garbage collection.
    img_bytes = Path(tmp, os.listdir(tmp)[0]).read_bytes()

# The generated PDF is written next to the sample PDF and is left on disk after the test.
_scanned = Path(_pdf).parent/'scanned_test.pdf'
images_to_pdf([img_bytes], output=str(_scanned))
scanned = PdfDocument(str(_scanned))
assert scanned.extract_text(0).strip() == '', 'scanned PDF should have no native text'
assert _scanned.stat().st_size > 100_000, 'scanned PDF should contain the embedded image'
print('images_to_pdf test passed:', _scanned)
pdf_texts: 15 pages, pdf_markdown: 15 pages, pdf_links: 18 links
images_to_pdf test passed: pdfs/scanned_test.pdf

source

mv_skill_md


def mv_skill_md(
    dry_run:bool=True, dir:NoneType=None
)->None:

Copy bundled SKILL.md to skill directories (.agents/, .claude/, .Codex/).

import importlib.util as _iu
import tempfile
from pathlib import Path as _Path

# Resolve the installed litesearch package through importlib so the check does not depend on cwd.
_spec = _iu.find_spec('litesearch')
skill_src = _Path(_spec.submodule_search_locations[0]).joinpath('SKILL.md')
assert skill_src.exists(), f'SKILL.md not found at {skill_src.resolve()}'

with tempfile.TemporaryDirectory() as tmp:
    tmp = _Path(tmp)
    # First pass: copy SKILL.md into each skill directory layout under the temp root.
    for d in ('.agents', '.claude', '.Codex'):
        dest = tmp.joinpath(d, 'skills', 'litesearch', 'SKILL.md')
        dest.parent.mkdir(parents=True, exist_ok=True)
        dest.write_text(skill_src.read_text())
    # Second pass: confirm every copy exists and mentions the package name.
    for d in ('.agents', '.claude', '.Codex'):
        dest = tmp.joinpath(d, 'skills', 'litesearch', 'SKILL.md')
        assert dest.exists(), f'Missing: {dest}'
        assert 'litesearch' in dest.read_text()
print('mv_skill_md install paths ok')
mv_skill_md install paths ok