core

web search made easy

core is the fetch-and-read layer. fetch() handles any URL from a simple static page to a JavaScript-rendered SPA; to_md() strips HTML to clean markdown. The module also covers pagination across JSON APIs, reading arXiv papers and YouTube transcripts, and cloning GitHub repos.


source

save_path


def save_path(
    path:NoneType=None
):

Get cache path for name (e.g. ‘arxiv’ or ‘fetch’)


source

http_post


def http_post(
    url, kw:VAR_KEYWORD
):

Send an HTTP POST request to url; additional keyword arguments are accepted via kw.


source

http_get


def http_get(
    url, kw:VAR_KEYWORD
):

Send an HTTP GET request to url; additional keyword arguments are accepted via kw.


source

get_pdf


def get_pdf(
    url:str
):

Fetch PDF from URL and return as PdfDocument


source

read_arxiv


def read_arxiv(
    url:str, # arxiv PDF URL, or arxiv abstract URL, or arxiv ID
    save_pdf:bool=True, # if True, saves the downloaded PDF to disk
    save_dir:str='.', # directory in which to save the PDF
    force:bool=False, # if True, forces re-download of PDF even if it exists on disk
):

Get paper information from arxiv URL or ID, optionally saving PDF to disk


source

read_gh_repo


def read_gh_repo(
    path_or_url:str, # GitHub URL, SSH address, or local path
    globs:tuple=None, # file glob patterns (default: README*, pyproject.toml, *.py)
    limit:int=None, # max files to return
    as_list:bool=False, # return list of Paths instead of {path: content} dict
):

Read files from a GitHub repo filtered by glob patterns


source

read_gh_file


def read_gh_file(
    url:str, # GitHub blob URL of the file to read
):

Read raw contents of a file from its GitHub URL

read_arxiv('https://arxiv.org/abs/2306.14881')['summary'][:200]
'Low-metallicity dwarf galaxies often show no or little CO emission, despite the intense star formation observed in local samples. Both simulations and resolved observations indicate that molecular gas'
read_gh_file('https://github.com/Karthik777/litesearch/blob/main/README.md')[:200]
'# litesearch\n\n\n<!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! -->\n\n> **NB** Reading this on GitHub? The formatted\n> [documentation](https://Karthik777.github.io/litesearch/) is nicer.\n\nlitese'
list(read_gh_repo('https://github.com/vedicreader/gheasy'))
['/Users/71293/.cache/.fossick/git_clones/gheasy/README.md',
 '/Users/71293/.cache/.fossick/git_clones/gheasy/pyproject.toml',
 '/Users/71293/.cache/.fossick/git_clones/gheasy/gheasy/__init__.py',
 '/Users/71293/.cache/.fossick/git_clones/gheasy/gheasy/_modidx.py',
 '/Users/71293/.cache/.fossick/git_clones/gheasy/gheasy/core.py',
 '/Users/71293/.cache/.fossick/git_clones/gheasy/gheasy/workflow.py']

Web Fetching

fetch() returns a dict with url, status, html, data (parsed JSON when the response is JSON), and xhr (captured network calls when capture_xhr=True). to_md() produces clean markdown, optionally extracting just the element matched by a CSS selector.


source

to_md


def to_md(
    page_or_html, # Page dict (from fetch/crawl) or raw HTML string
    sel:str=None, # CSS selector to extract before conversion; returns '' if no match
    multi:bool=False, # Return all selector matches joined
    wrap_tag:str=None, # Wrap each multi-result in <wrap_tag>...</wrap_tag>; only used when multi=True
    ignore_links:bool=True, rm_comments:bool=True, rm_details:bool=True
)->str:

Convert a Page dict or HTML string to clean markdown


source

html2md


def html2md(
    s:str, ignore_links:bool=True
):

Convert s from HTML to markdown


source

clean_md


def clean_md(
    text, rm_comments:bool=True, rm_details:bool=True
):

Remove comments and <details> sections from text


source

fetch


def fetch(
    url:str, # URL to fetch
    sel:str=None, # CSS selector to extract (None = full page)
    method:str='GET', # HTTP method; 'POST' sends payload as JSON body
    payload:dict=None, # POST body (JSON) or GET query params
    heavy:bool=False, # Full JS rendering via headless browser
    stealthy:bool=False, # Anti-bot stealth fetcher (Cloudflare etc.)
    capture_xhr:bool=False, # Intercept XHR/fetch calls; forces heavy=True
    cache:bool=False, # Cache successful responses to disk by URL+sel
    force:bool=False, # If True, forces re-fetch even if cached result exists
    kw:VAR_KEYWORD # Extra kwargs passed to scrapling (e.g. verify, headers)
)->dict:

Fetch url, return Page dict {url, status, html, data, xhr} where html is raw response body


source

crawl


def crawl(
    start_url:str, # URL to start from
    sel:str=None, # CSS selector to extract per page
    follow_sel:str='a[href]', # CSS selector for links to follow
    same_domain:bool=True, # Only follow links on same domain
    max_pages:int=10, # Max pages to visit
    delay:float=0, # Seconds to wait between requests (polite crawling)
    heavy:bool=False, stealthy:bool=False,
    kw:VAR_KEYWORD # Extra kwargs passed to scrapling (e.g. verify, timeout)
)->list:

Crawl from start_url, following follow_sel links, return list of Page dicts


source

fetch_all


def fetch_all(
    urls:list, # URLs to fetch
    sel:str=None, # CSS selector to extract per page (None = full page)
    concurrency:int=8, # Max parallel fetches
    heavy:bool=False, stealthy:bool=False,
    kw:VAR_KEYWORD # Extra kwargs passed to fetch()
)->list:

Fetch a list of URLs in parallel; returns Page dicts in the same order as urls


source

get_options


def get_options(
    page_or_html, # Page dict (from fetch) or raw HTML string
    sel:str, # CSS selector for the <select> element
)->list:

Extract options from a