data

Utilities to aid data extraction and query preprocessing.

We will build a simple pipeline that ingests PDF documents into a litesearch database so they can be searched.

Extensions to the pymupdf Document and Page classes to extract text, images and links.

/opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/usearch/__init__.py:125: UserWarning: Will download `usearch_sqlite` binary from GitHub.
  warnings.warn("Will download `usearch_sqlite` binary from GitHub.", UserWarning)

source

Document.ext_imgs


def ext_imgs(
    st:int=0, end:int=-1
):

source

Document.ext_im


def ext_im(
    it:NoneType=None
):

source

Document.get_texts


def get_texts(
    st:int=0, end:int=-1, kw:VAR_KEYWORD
):

Code extraction utilities


source

pyparse


def pyparse(
    p:Path=None, # path to a python file
    code:str=None, # code string to parse
    imports:bool=False, # include import statements as code chunks
)->L:

Parse a code string or python file and return code chunks as list of dicts with content and metadata.

You can use pyparse to extract code chunks from a python file or code string.

txt = """
from fastcore.all import *
a=1
class SomeClass:
    def __init__(self,x): store_attr()
    def method(self): return self.x + a
 """
pyparse(code=txt)
(#2) [{'content': 'a=1', 'metadata': {'path': None, 'uploaded_at': None, 'name': None, 'type': 'Assign', 'lineno': 3, 'end_lineno': 3}},{'content': 'class SomeClass:\n    def __init__(self,x): store_attr()\n    def method(self): return self.x + a', 'metadata': {'path': None, 'uploaded_at': None, 'name': 'SomeClass', 'type': 'ClassDef', 'lineno': 4, 'end_lineno': 6}}]

Setting imports to True will also include import statements as code chunks.

pyparse(code=txt, imports=True)
(#3) [{'content': 'from fastcore.all import *', 'metadata': {'path': None, 'uploaded_at': None, 'name': None, 'type': 'ImportFrom', 'lineno': 2, 'end_lineno': 2}},{'content': 'a=1', 'metadata': {'path': None, 'uploaded_at': None, 'name': None, 'type': 'Assign', 'lineno': 3, 'end_lineno': 3}},{'content': 'class SomeClass:\n    def __init__(self,x): store_attr()\n    def method(self): return self.x + a', 'metadata': {'path': None, 'uploaded_at': None, 'name': 'SomeClass', 'type': 'ClassDef', 'lineno': 4, 'end_lineno': 6}}]

source

pkg2chunks


def pkg2chunks(
    pkg:str, # package name
    imports:bool=False, # include import statements as code chunks
    kw:VAR_KEYWORD # additional args to pass to pkg2files
)->L:

Return code chunks from a package with extra metadata.


source

pkg2files


def pkg2files(
    pkg:str, # package name
    file_glob:str='*.py', # file glob to match
    skip_file_glob:str='_*', # file glob to skip
    skip_file_re:str='(^__init__\\.py$|^setup\\.py$|^conftest\\.py$|^test_.*\\.py$|^tests?\\.py$|^.*_test\\.py$)', # regex to skip files
    skip_folder_re:str='(^tests?$|^__pycache__$|^\\.eggs$|^\\.mypy_cache$|^\\.tox$|^examples?$|^docs?$|^build$|^dist$|^\\.git$|^\\.ipynb_checkpoints$)', # regex to skip folders
    path:Path | str='.', # path to start searching
    recursive:bool=True, # search subfolders
    symlinks:bool=True, # follow symlinks?
    file_re:str=None, # Only include files matching regex
    folder_re:str=None, # Only enter folders matching regex
    func:callable=os.path.join, # function to apply to each matched file
    ret_folders:bool=False, # return folders, not just files
    sort:bool=True, # sort files by name within each folder
    types:str | list=None, # list or comma-separated str of ext types from: py, js, java, c, cpp, rb, r, ex, sh, web, doc, cfg
    exts:str | list=None, # list or comma-separated str of exts to include
)->L: # additional args to pass to globtastic

Return list of python files in a package excluding tests and setup files.

pkg2chunks can be used to extract code chunks from an entire package installed in your environment.

chunks=pkg2chunks('fastlite')
chunks.filter(lambda d: d['metadata']['type']=='FunctionDef')[0]
{'content': 'def t(self:Database): return _TablesGetter(self)',
 'metadata': {'path': '/Users/71293/code/litesearch/.venv/lib/python3.13/site-packages/fastlite/core.py',
  'uploaded_at': 1752468812.9739048,
  'name': 't',
  'type': 'FunctionDef',
  'lineno': 44,
  'end_lineno': 44,
  'package': 'fastlite',
  'version': '0.2.1'}}

source

installed_packages


def installed_packages(
    nms:list=None, # list of package names
)->L:

Return list of installed packages. If nms is provided, return only those packages.

Get the list of installed packages in your environment using installed_packages. If you pass a list of package names, only those that exist in your environment are returned.

installed_packages(['fstlite']) # non existent package
installed_packages(['fastlite']) # existing package
installed_packages() # all installed packages that are not stdlib
(#179) ['litesearch','shellingham','jiter','ipykernel','simsimd','threadpoolctl','coloredlogs','uri-template','humanfriendly','socksio','rfc3339-validator','pexpect','jupyterlab-quarto','fqdn','requests','babel','rich','traitlets','tokenizers','urllib3'...]

Query Preprocessing utilities


source

pre


def pre(
    q:str, # query to be passed for fts search
    wc:bool=True, # add wild card to each word
    wide:bool=True, # widen the query with OR operator
    extract_kw:bool=True, # extract keywords from the query
):

Preprocess the query for fts search.


source

kw


def kw(
    q:str, # query to be passed for fts search
):

Extract keywords from the query using YAKE library.


source

mk_wider


def mk_wider(
    q:str, # query to be passed for fts search
):

Widen the query by joining words with OR operator.


source

add_wc


def add_wc(
    q:str, # query to be passed for fts search
):

Add a wildcard (`*`) to each word in the query.


source

clean


def clean(
    q:str, # query to be passed for fts search
):

Clean the query by removing `*` and returning None for empty queries.

You can clean queries passed into fts search using clean, add wildcards using add_wc, widen the query using mk_wider, and extract keywords using kw. You can combine all of these using the pre function.

q = 'This is a sample query'
print('preprocessed q with defaults: `%s`' %pre(q))
print('keywords extracted: `%s`' %pre(q, wc=False, wide=False))
print('q with wild card: `%s`' %pre(q, extract_kw=False, wide=False, wc=True))
preprocessed q with defaults: `query* OR sample*`
keywords extracted: `query sample`
q with wild card: `This* is* a* sample* query*`