# Source code for ir

"""``ir`` — an information-retrieval substrate for agentic systems.

One uniform "find the relevant things in this corpus" contract that scales from
an ad-hoc search over an ephemeral list to a maintained search engine. Retrieval
is the core; selection/expansion/reranking/generation are layered on top.

Quick start::

    import ir

    # Define a corpus source (abstract strategy + parameters, smart defaults):
    source = ir.CorpusSource.from_md_reports()          # project docs/ reports
    corpus = ir.build(source)                            # index (incremental)
    hits = ir.search(corpus, "how do I deploy the app")  # ranked SearchHits

    # Light, dependency-free embedding for fast tests:
    corpus = ir.build(source, embedder="light")

A corpus source is defined by a ``scope`` (what is in the corpus), a
``change_signal`` (what counts as stale), an ``indexing_strategy`` (how a raw
item becomes filter fields + embeddable surfaces), and an ``embedder``. The
default embedder is a decent *local* model (``all-MiniLM-L6-v2``); ``"light"``
selects a numpy-only hashing embedder. Data persists under XDG dirs through a
``dol`` repository layer.
"""

from __future__ import annotations

from . import embed as _embed  # noqa: F401  (sets USE_TF=0 before transformers)
from . import registry
from .base import Artifact, IndexPlan, Record, SearchHit, Surface, tag_source
from .expand import (
    NeighborhoodPolicy,
    Passage,
    expand,
    parent_policy,
    sentence_window_policy,
)
from .formulate import Formulator, make_llm_formulator
from .graph import (
    CorpusGraph,
    EdgeExtractor,
    GraphStore,
    canonical_node_id,
    default_edge_extractor,
)
from .index import Corpus, build, open_corpus
from .registry import retriever_for, retrievers
from .retrieve import Retriever, as_retriever, fuse_hits, records_for_artifact
from .retrieve import search as _search
from .select import (
    Disclosure,
    DiscoveryResult,
    Selection,
    disclose,
    discover,
    select,
)
from .sources import CorpusSource
from .store import CorpusStore
from .strategy import Chunked, IndexingStrategy, Package, Skill, WholeText
from .synopsis import Synthesizer, make_llm_synthesizer, with_synopsis
from .traverse import WalkPolicy, WalkState, collapsed_tree_policy, traverse

# Public API of the ``ir`` package. Order is preserved as-is; the comments only
# group the names by the submodule they are imported from above.
__all__ = [
    # .base — core data types
    "Artifact",
    "Surface",
    "Record",
    "SearchHit",
    "IndexPlan",
    # .strategy — indexing strategies
    "IndexingStrategy",
    "WholeText",
    "Chunked",
    "Skill",
    "Package",
    # .synopsis
    "with_synopsis",
    "make_llm_synthesizer",
    "Synthesizer",
    # .sources / .store / .index
    "CorpusSource",
    "CorpusStore",
    "Corpus",
    "build",
    "open_corpus",
    # .retrieve / .registry — retrieval
    "search",
    "as_retriever",
    "Retriever",
    "retrievers",
    "retriever_for",
    "fuse_hits",
    "records_for_artifact",
    # .graph
    "GraphStore",
    "CorpusGraph",
    "EdgeExtractor",
    "default_edge_extractor",
    "canonical_node_id",
    # .traverse
    "traverse",
    "WalkPolicy",
    "WalkState",
    "collapsed_tree_policy",
    # .expand
    "expand",
    "Passage",
    "NeighborhoodPolicy",
    "sentence_window_policy",
    "parent_policy",
    # .base (helper)
    "tag_source",
    # .formulate
    "make_llm_formulator",
    "Formulator",
    # .select
    "select",
    "disclose",
    "discover",
    "Selection",
    "Disclosure",
    "DiscoveryResult",
    # defined in this module
    "register",
    "corpora",
    "build_corpus",
]

# Package-level shortcuts onto the registry module.
register = registry.register
corpora = registry.registered

# ``search`` is exported via ``__all__`` and used in the module docstring's
# quick start, yet the retrieval function is only imported under the private
# alias ``_search``. Bind the public name so ``ir.search`` and
# ``from ir import *`` resolve.
# NOTE(review): if a richer ``search`` wrapper is defined later in this module,
# it simply overrides this alias — confirm which one is intended.
search = _search


def build_corpus(name, **kwargs):
    """Build (or update) the registered/preset corpus called *name*.

    The corpus source is looked up in the registry by *name* and handed to
    :func:`ir.build`; the result is a :class:`~ir.index.Corpus`.

    All ``**kwargs`` are passed straight through to :func:`ir.build` — notably
    ``store``, ``embedder`` (e.g. ``"light"`` for the numpy-only hashing
    embedder), ``full`` (prune artifacts no longer in the source), and
    ``batch_size``.
    """
    source = registry.source_for(name)
    return build(source, **kwargs)
# The evaluation harness is reachable as ``ir.eval`` (its ``ef`` imports are
# lazy, so this does not weigh down ``import ir``). Kept out of ``__all__`` so a
# star-import does not shadow the ``eval`` builtin. ``ir.eval_gen`` is the
# build-time case generator (its ``oa`` import is lazy too).
from . import eval  # noqa: E402,F401  (submodule attribute: ir.eval)
from . import eval_gen  # noqa: E402,F401  (submodule attribute: ir.eval_gen)