Source code for ir.cli

"""Command-line surface for ``ir`` (argh-dispatched).

Commands operate on **named** corpora from the registry (see
:mod:`ir.registry`)::

    ir build skills                 # build/update the skills preset corpus
    ir search skills "deploy app"   # rank candidates (retrieval only)
    ir discover skills "deploy app" # retrieve -> commit to a high-precision subset
    ir discover skills "deploy app" --disclose   # + load each selected item's body
    ir ls                           # list corpora + record counts
    ir info packages                # config + stats for a corpus
    ir register notes files --root ~/notes --pattern '.*\\.md$'
    ir rm notes                     # unregister (keeps built data)
    ir eval-gen skills skills_eval.jsonl --k 5        # generate cases (needs aix/LLM)
    ir eval skills skills_eval.jsonl --mode hybrid    # score retrieval on a case file
    ir eval-select skills skills_eval.jsonl           # score the selection stage
    ir sweep-select skills skills_eval.jsonl          # tune max_k/rel for the selector
    ir calibrate-min-score skills skills_eval.jsonl --persist  # abstention floor
    ir discover skills "deploy app" --min-score auto  # use the calibrated floor
"""

from __future__ import annotations

from . import registry
from .eval import DFLT_CALIB_SENSITIVITY_WEIGHT
from .index import build as _build
from .index import open_corpus


def _empty_corpus_msg(name) -> str:
    """The shared 'corpus is empty, build it first' guard message."""
    return f"corpus {name!r} is empty; build it first: ir build {name}"


def _drift_warning(drift, name) -> str:
    """A warning suffix when cases reference gold ids absent from the corpus.

    Returns ``""`` when there is no drift, so call sites can ``out +=`` it
    unconditionally.
    """
    if not drift:
        return ""
    return (
        f"\n  WARNING: {len(drift)} case(s) reference gold ids absent from "
        f"corpus {name!r} (stale fixture?); their misses are not real."
    )



[docs]
def ls():
    """List registered corpora with their kind, embedder, and record count.

    One line is produced per registry entry. A corpus that cannot be opened
    (any exception from ``open_corpus`` or ``len``) is reported with
    ``records=0`` rather than aborting the listing.
    """
    corpora = registry.registered()
    if not corpora:
        return "No corpora registered. Try: ir build skills"

    def _row(corpus_name, entry):
        # Best-effort count: an unopenable corpus must not break the listing.
        try:
            n_records = len(open_corpus(corpus_name))
        except Exception:
            n_records = 0
        kind = entry["kind"]
        embedder = entry.get("embedder", "default")
        return f"{corpus_name:18} {kind:10} {embedder:10} records={n_records}"

    return "\n".join(_row(corpus_name, entry) for corpus_name, entry in corpora.items())




[docs]
def register(
    name,
    kind,
    *,
    root=None,
    pattern=None,
    embedder="default",
    reindex_on=None,
    every_hours=None,
    synopsis=False,
):
    """Register a named corpus. kind: skills | packages | reports | files.

    ``root`` and ``pattern`` are forwarded to the registry as source
    parameters only when given.

    Background-work policy (optional; smart per-kind defaults otherwise — see
    `ir.policy`): reindex_on (source-change | interval | manual), every_hours (for
    interval), synopsis (enable LLM synopses, run only in the policy's downtime
    window by `ir maintain`). Passing every_hours selects interval reindexing
    and takes precedence over reindex_on.
    """
    # Only truthy source parameters are forwarded.
    source_params = {
        key: value for key, value in (("root", root), ("pattern", pattern)) if value
    }

    if not (reindex_on or every_hours or synopsis):
        # No policy flag given: leave the maintenance policy unset.
        maintenance = None
    else:
        if every_hours:
            reindex = {"on": "interval", "every_hours": float(every_hours)}
        elif reindex_on:
            reindex = {"on": reindex_on}
        else:
            reindex = {}
        maintenance = {"reindex": reindex, "synopsis": {"enabled": bool(synopsis)}}

    registry.register(
        name, kind, embedder=embedder, maintenance=maintenance, **source_params
    )
    return f"registered {name!r} (kind={kind}, embedder={embedder})"




[docs]
def build(name, *, embedder=None, full=True):
    """Build or incrementally update a registered (or preset) corpus.

    Resolves *name* to its source through the registry, hands it to the index
    builder together with ``embedder`` and ``full``, and reports the resulting
    record count and embedder id.
    """
    built = _build(registry.source_for(name), embedder=embedder, full=full)
    n_records = len(built)
    return f"built {name!r}: {n_records} records (embedder {built.embedder_id})"




[docs]
def search(name, query, *, k=10, mode="dense", fusion="rrf"):
    """Search a built corpus and print the top-k hits.

    mode: dense (cosine) | lexical (BM25) | hybrid (dense + BM25 fused). fusion
    (hybrid only): rrf (rank-based, default) | blend (magnitude-preserving;
    better abstention separability — see ir_08).

    Each hit is rendered as ``score  artifact_id  [surface_kind]``; an empty
    corpus returns the build hint and an empty result set ``(no matches)``.
    """
    corpus = open_corpus(name)
    if len(corpus) == 0:
        return _empty_corpus_msg(name)

    hits = corpus.search(query, k=k, mode=mode, fusion=fusion)
    # artifact_id is the label that is unique across corpora: a skill
    # name[@parent], a package name, or a report's relative path.
    rendered = "\n".join(
        f"{hit.score:+.3f}  {hit.artifact_id}  [{hit.surface_kind}]" for hit in hits
    )
    return rendered if rendered else "(no matches)"




[docs]
def discover(
    name,
    query,
    *,
    k=10,
    mode="hybrid",
    strategy="conservative",
    disclose=False,
    min_score=None,
    fusion="rrf",
):
    """Search a corpus, commit to a distractor-robust subset, and show it.

    Retrieves ``k`` candidates, then *selects* the few high-precision results an
    agent should act on (or abstains). ``--disclose`` additionally loads each
    selected item's body (SKILL.md / file text) via its stored pointer.
    ``--min-score auto`` turns on absolute abstention using the floor calibrated
    by ``ir calibrate-min-score`` (or pass a float). mode: dense | lexical |
    hybrid. fusion (hybrid only): rrf (default) | blend (magnitude-preserving;
    pair with --min-score for abstention — see ir_08). strategy: conservative |
    top_k | rel_threshold | score_gap.

    Returns the text to print: the build hint for an empty corpus, a one-line
    abstention notice, or a header followed by one ``score  name`` line per
    selected item. With ``disclose`` each item that has a body also gets a
    single-line preview of at most 160 characters; the trailing ``…`` is added
    only when the body was actually cut.

    Raises ``ValueError`` if ``min_score`` is neither ``None``, ``"auto"``, nor
    convertible to ``float``.
    """
    from .select import discover as _discover

    preview_width = 160

    corpus = open_corpus(name)
    if len(corpus) == 0:
        return _empty_corpus_msg(name)

    # "auto" and None pass through unchanged; anything else (typically a
    # string from the command line) must be a number.
    if min_score is None or min_score == "auto":
        floor = min_score
    else:
        floor = float(min_score)

    result = _discover(
        corpus,
        query,
        k=k,
        mode=mode,
        strategy=strategy,
        disclose_level="body" if disclose else "metadata",
        min_score=floor,
        fusion=fusion,
    )
    if result.abstained:
        return (
            f"(abstained: {result.reason}; {result.n_retrieved} candidates retrieved)"
        )

    lines = [
        f"selected {len(result.results)}/{result.n_retrieved} "
        f"({result.strategy} / {result.mode}, {result.reason}):"
    ]
    for d in result.results:
        lines.append(f"  {d.score:+.3f}  {d.name}")
        if disclose and d.body:
            flat = d.body.strip().replace("\n", " ")
            if not flat:
                # Whitespace-only body: nothing worth previewing.
                continue
            # Mark truncation only when text was really dropped, so a short
            # body is not presented as if it had been cut.
            marker = "…" if len(flat) > preview_width else ""
            lines.append(f"           {flat[:preview_width]}{marker}")
    return "\n".join(lines)




[docs]
def info(name):
    """Show a corpus's stored config, stats, policy, and any abstention floors.

    The report has one ``key: value`` line each for the name, registry entry,
    record count, stored config, maintenance policy and last-maintained stamp,
    plus a ``min_score floors`` line when at least one mode has a calibration.
    """
    corpus = open_corpus(name)
    cfg = corpus.store.get_config()
    reg = registry.get(name)

    # One calibrated floor per mode that has a stored calibration.
    floors = {}
    for calibrated_mode in corpus.store.calibration_modes():
        floors[calibrated_mode] = corpus.store.get_calibration(calibrated_mode).get(
            "min_score"
        )

    pol = registry.policy_for(name)
    last = corpus.store.get_maintenance_state().get("last_maintained", "never")

    syn = pol.synopsis
    if syn.enabled:
        syn_str = f"enabled, scope={syn.scope}/{syn.window_days}d"
    else:
        syn_str = "off"
    if syn.downtime_hours:
        syn_str += f", downtime={syn.downtime_hours}"

    reindex_str = f"{pol.reindex.on}"
    if pol.reindex.every_hours:
        reindex_str += f"/{pol.reindex.every_hours}h"

    report = [
        f"name: {name}",
        f"registered: {reg}",
        f"records: {len(corpus)}",
        f"config: {cfg}",
        f"policy: reindex={reindex_str}, synopsis={syn_str}",
        f"last maintained: {last}",
    ]
    if floors:
        report.append(f"min_score floors: {floors}")
    return "\n".join(report)




[docs]
def maintain(name=None, *, all=False, dry_run=False):
    """Run due background work: incremental reindex, synopsis in its downtime window.

    With a name, maintains that corpus; with --all (or no name), every registered
    corpus. Idempotent and safe to schedule (cron/launchd): it no-ops what is not
    due. --dry-run reports what would run without doing it.

    The work itself is delegated to ``ir.maintenance.maintain``; each result it
    returns is printed on its own line.
    """
    from .maintenance import maintain as _maintain

    outcomes = _maintain(name=name, all=all, dry_run=dry_run)
    if outcomes:
        return "\n".join(map(str, outcomes))
    return "no corpora registered"




[docs]
def rm(name):
    """Unregister a corpus (does not delete its built data).

    Only the registry entry is removed; this function touches nothing else.
    """
    registry.unregister(name)
    return "unregistered {!r}".format(name)




[docs]
def eval(name, cases, *, mode="hybrid", k=10):
    """Score a built corpus's retrieval against a DiscoveryCase JSONL file.

    cases: path to a JSONL file of cases (see :mod:`ir.eval`); each line is a
    ``{"query": ..., "gold": [artifact_id, ...]}`` record (empty ``gold`` = an
    abstention case). Prints recall@k / NDCG@k / MRR / MAP plus the failure-mode
    taxonomy. mode: dense | lexical | hybrid.

    Metrics are computed at cutoffs 1, 5 and ``k`` (deduplicated), with ``k`` as
    the primary cutoff. A warning is appended when cases name gold ids that are
    not in the corpus.
    """
    from .eval import evaluate_discovery, load_cases, validate_cases

    corpus = open_corpus(name)
    if len(corpus) == 0:
        return _empty_corpus_msg(name)

    loaded = load_cases(cases)
    if not loaded:
        return f"no cases found in {cases!r}"

    stale = validate_cases(corpus, loaded)
    cutoffs = tuple(sorted({1, 5, k}))
    report = evaluate_discovery(
        corpus, loaded, mode=mode, primary_k=k, k_values=cutoffs
    )
    return str(report) + _drift_warning(stale, name)




[docs]
def eval_gen(name, out, *, k=5, abstention_frac=0.15, max_artifacts=None):
    """Generate an eval-case file for a corpus by back-translation (needs aix/LLM).

    Writes a DiscoveryCase JSONL set (gold cases + an abstention slice) for the
    registered corpus *name* to *out*, stamping a corpus-signature into the
    header so the frozen file can be checked against the live corpus later. This
    command calls an LLM via aix; scoring it afterwards (`ir eval`) is offline.

    ``max_artifacts`` is forwarded to the generator (as an int) only when given.
    """
    from .eval import save_cases
    from .eval_gen import build_eval_set, corpus_signature

    source = registry.source_for(name)
    # Leave the generator's own default in place unless a cap was requested.
    extra = {} if max_artifacts is None else {"max_artifacts": int(max_artifacts)}
    cases = build_eval_set(
        source, k=k, abstention_frac=abstention_frac, corpus_name=name, **extra
    )

    header = {"corpus": name, "corpus_signature": corpus_signature(source), "k": k}
    save_cases(cases, out, meta=header)

    n_gold = sum(1 for case in cases if not case.gold_is_none)
    n_abstain = len(cases) - n_gold
    return f"wrote {len(cases)} cases ({n_gold} gold, {n_abstain} abstention) to {out!r}"




[docs]
def eval_select(
    name,
    cases,
    *,
    strategy="conservative",
    mode="hybrid",
    k=10,
    max_k=3,
    rel=0.9,
    min_score=None,
):
    """Score a selector against a DiscoveryCase JSONL file (selection quality).

    Reports the conditional commit rate (the selection decision isolated from
    retrieval) plus selection precision / recall / F1 and abstention accuracy.
    strategy: conservative | top_k | rel_threshold | score_gap. mode: dense |
    lexical | hybrid. max_k / rel / min_score tune the commit (see `ir.select`);
    `ir sweep-select` sweeps them to find good values.

    max_k is coerced to int, rel and (when given) min_score to float before
    being handed to the evaluator.
    """
    from .eval import evaluate_selection, load_cases, validate_cases

    corpus = open_corpus(name)
    if len(corpus) == 0:
        return _empty_corpus_msg(name)

    loaded = load_cases(cases)
    if not loaded:
        return f"no cases found in {cases!r}"

    stale = validate_cases(corpus, loaded)
    selector_knobs = {
        "max_k": int(max_k),
        "rel": float(rel),
        "min_score": float(min_score) if min_score is not None else None,
    }
    report = evaluate_selection(
        corpus, loaded, strategy=strategy, mode=mode, k=k, **selector_knobs
    )
    return str(report) + _drift_warning(stale, name)




[docs]
def sweep_select(
    name,
    cases,
    *,
    strategy="conservative",
    mode="hybrid",
    k=10,
    objective="selection_f1",
):
    """Sweep selector knobs (max_k × rel) against a case file; print the grid + best.

    Retrieves once per case and reuses the candidates across the whole grid, so
    the sweep is cheap. Prints a table (one row per setting, best objective first)
    and the winning max_k / rel. Use it to pick `ir.select` defaults empirically.
    objective: selection_f1 | selection_precision | selection_recall |
    conditional_commit_rate | mean_selected_size. mode: dense | lexical | hybrid.
    """
    from .eval import load_cases, sweep_selector, validate_cases

    corpus = open_corpus(name)
    if len(corpus) == 0:
        return _empty_corpus_msg(name)

    loaded = load_cases(cases)
    if not loaded:
        return f"no cases found in {cases!r}"

    stale = validate_cases(corpus, loaded)
    grid = sweep_selector(
        corpus, loaded, strategy=strategy, mode=mode, k=k, objective=objective
    )
    # The sweep result renders itself; only the drift warning is added here.
    return str(grid) + _drift_warning(stale, name)




[docs]
def calibrate_min_score(
    name,
    cases,
    *,
    mode="hybrid",
    k=10,
    sensitivity_weight=DFLT_CALIB_SENSITIVITY_WEIGHT,
    persist=False,
    fusion="rrf",
):
    """Calibrate the absolute abstention min_score floor for a corpus + mode.

    Separates in-scope (gold-bearing) from out-of-scope (empty-gold) query top
    scores and reports the floor that best splits them, with sensitivity /
    specificity / Youden's J. ``cases`` must include BOTH gold-bearing and
    abstention cases — generate them with ``ir eval-gen`` (it adds an abstention
    slice). ``--persist`` stores the floor so ``ir discover ... --min-score auto``
    will abstain by it. ``--sensitivity-weight`` (0..1, balanced default);
    lower it to abstain more readily (precision-leaning). For hybrid, calibrate
    with ``--fusion blend`` (rank-based RRF barely separates — see ir_08) and
    query under the same fusion. mode: dense | lexical | hybrid. fusion (hybrid
    only): rrf | blend.
    """
    from .eval import calibrate_min_score as _calibrate
    from .eval import load_cases, validate_cases

    corpus = open_corpus(name)
    if len(corpus) == 0:
        return _empty_corpus_msg(name)

    loaded = load_cases(cases)
    if not loaded:
        return f"no cases found in {cases!r}"

    stale = validate_cases(corpus, loaded)
    calib = _calibrate(
        corpus,
        loaded,
        mode=mode,
        k=int(k),
        sensitivity_weight=float(sensitivity_weight),
        persist=bool(persist),
        fusion=fusion,
    )

    parts = [str(calib)]
    if persist and calib.min_score is not None:
        # Echo the matching discover invocation; --fusion is only worth
        # repeating when a non-default fusion was used for hybrid.
        if mode == "hybrid" and fusion != "rrf":
            fusion_hint = f" --fusion {fusion}"
        else:
            fusion_hint = ""
        parts.append(
            f"\n  persisted → ir discover {name} ... --mode {mode}{fusion_hint} "
            f"--min-score auto"
        )
    if calib.min_score is None:
        parts.append(
            f"\n  NOTE: no floor calibrated ({calib.reason}); a floor needs both "
            f"gold-bearing and abstention cases. Add abstention cases via ir eval-gen."
        )
    parts.append(_drift_warning(stale, name))
    return "".join(parts)



# The public command functions of this module, collected in one list.
# NOTE(review): the module docstring calls this surface "argh-dispatched", so
# this list is presumably what the entry point hands to argh — the dispatch
# call is not in this file; confirm before reordering or renaming entries.
COMMANDS = [
    ls,
    register,
    build,
    search,
    discover,
    info,
    maintain,
    rm,
    eval,
    eval_gen,
    eval_select,
    sweep_select,
    calibrate_min_score,
]