"""The semantic link graph — typed edges between artifacts (report 12).
ir indexes artifacts that *refer to each other*: a package depends on other
packages, a skill belongs to a parent, a chunk follows the previous one, a
report cites another. This module models those references as **typed, directed
edges** between nodes identified by ``(source, artifact_id)`` (ir's canonical
node identity since 0.1.16), queryable at retrieval time — the substrate the
traversal operator (``traverse``, follow-up) walks.
Two pieces:
- the :data:`GraphStore` **protocol** — a minimal ``__getitem__`` +
``neighbors`` structural contract, so a traversal operator binds to *any*
conforming store, not only ir's;
- :class:`CorpusGraph` — the corpus-backed implementation. Node payloads are
the artifact's stored records (via :func:`ir.retrieve.records_for_artifact`);
neighbors come from the ``links`` view on :class:`~ir.store.CorpusStore`.
Edges are **regenerable derived state** (persisted in the ``links`` view, like
``calibration``) — never part of build identity. They are ingested at build
time by an injectable :data:`EdgeExtractor`; the shipped
:func:`default_edge_extractor` turns the edge data already latent in the index
(``Package`` ``deps`` → ``REF``, ``Skill`` ``parent`` → ``PARENT``) into edges.
Vocabulary (documented, **not** enforced): :data:`NEXT` / :data:`PREV` /
:data:`PARENT` / :data:`CHILD` / :data:`REF`. ``NEXT`` / ``PREV`` are *not*
materialized — they are derivable from the ledger's surface order (#44); the
view stores only what isn't derivable.
**Naming guard** (ADR #43): this is the **semantic link graph** — directed,
**possibly cyclic** (citations, cross-references), traversed at query time. It
is *not* ``ef.artifact_graph``, which is an acyclic build-time
derivation/lineage DAG. The id conventions are kept compatible so a later
unification stays open, but the two are distinct substrates.
"""
from __future__ import annotations
import re
from collections.abc import Iterable, Mapping
from typing import Any, Callable, Protocol, runtime_checkable
from .base import Record
from .retrieve import records_for_artifact
# ---- edge vocabulary (documented, not enforced) --------------------------- #
# These are plain string labels; nothing in this module validates that an
# edge type is one of them.
#: Next sibling surface in sequence (derivable from the ledger — not stored).
NEXT = "NEXT"
#: Previous sibling surface in sequence (derivable from the ledger — not stored).
PREV = "PREV"
#: Containment up: a child points at its parent (e.g. a skill → its package).
PARENT = "PARENT"
#: Containment down: a parent points at a child.
CHILD = "CHILD"
#: A generic reference / dependency / citation (e.g. a package → its deps).
REF = "REF"
#: A node's identity: ``artifact_id`` within one corpus, or ``(source,
#: artifact_id)`` across corpora. The visited-set of a traversal keys on this.
#: Spelled as a string, so it documents the shape rather than being an
#: evaluated typing object.
NodeId = "str | tuple[str, str]"
# ---- the GraphStore protocol ---------------------------------------------- #
@runtime_checkable
class GraphStore(Protocol):
    """The two-method structural interface a traversal operator depends on.

    Any object offering item lookup plus a ``neighbors`` method conforms —
    ir's own :class:`CorpusGraph` as well as any external graph adapter:

    - ``store[node_id]`` resolves a node id to its scorable payload;
    - ``store.neighbors(node_id, edge_type=...)`` lists the adjacent node ids,
      optionally limited to a single edge type.

    The contract is intentionally silent about granularity: an artifact graph
    and a surface-level tree are both valid ``GraphStore``s.

    Because the protocol is ``runtime_checkable``, ``isinstance(x,
    GraphStore)`` works, but it only verifies that the two attribute **names**
    exist. Signatures are not inspected, so it cannot tell whether
    ``neighbors`` accepts the right arguments — use it as a smoke check that
    separates a conforming adapter from an arbitrary object, nothing more.
    """

    def __getitem__(self, node_id: Any) -> Any:
        """Resolve *node_id* to its scorable payload."""

    def neighbors(self, node_id: Any, *, edge_type: str | None = None) -> Iterable:
        """List the node ids adjacent to *node_id*, optionally of one *edge_type*."""
# ---- corpus-backed graph -------------------------------------------------- #
class CorpusGraph:
    """Single-corpus :data:`GraphStore`: artifacts are nodes, ``links`` are edges.

    Node ids are ``artifact_id`` strings. Indexing (``graph[aid]``) returns the
    artifact's stored records — its scorable surfaces, in plan order — while
    ``graph.neighbors(aid, edge_type=...)`` reads outgoing edges from the
    corpus store's ``links`` view.

    Only **intra-corpus** ids can be dereferenced. :meth:`neighbors` hands
    cross-corpus ``[source, artifact_id]`` targets back verbatim, but
    ``__getitem__`` looks ids up in *this* corpus only (federated traversal is
    a follow-up).
    """

    def __init__(self, store_or_corpus: Any):
        """Wrap either a corpus (an object exposing ``.store``) or a bare store."""
        #: The corpus name, when known — the ``source`` half of this graph's
        #: node identities (``None`` for a bare store).
        self.source = getattr(store_or_corpus, "name", None)
        # A corpus carries its store as ``.store``; anything without that
        # attribute is taken to be the store itself.
        self.store = getattr(store_or_corpus, "store", store_or_corpus)

    def __getitem__(self, node_id: str) -> list[Record]:
        """Return the artifact's plan-ordered records (``NoLedgerEntry`` if unknown)."""
        records = records_for_artifact(self.store, node_id)
        return records

    def neighbors(self, node_id: str, *, edge_type: str | None = None) -> list:
        """Return the outgoing neighbor ids of *node_id*.

        With *edge_type* given, only targets of that edge type are returned;
        otherwise the targets of every edge type are concatenated in the order
        the store yields them. Duplicates are dropped, keeping the first
        occurrence and its position.

        Targets come back in stored form: a bare ``artifact_id`` (implicitly
        in this graph's :attr:`source`) or a ``[source, artifact_id]`` list for
        a cross-corpus edge. Feed each target to :func:`canonical_node_id` to
        obtain the canonical ``(source, artifact_id)`` key for a traversal's
        visited-set.

        An artifact with no edges yields ``[]``; this relies on the store's
        ``get_links`` returning an empty mapping in that case (including, per
        the store's contract, when it has no links view).

        *node_id* must be an intra-corpus ``artifact_id`` (a ``str``); a
        cross-corpus target obtained from another graph is out of contract
        here, since it has no edges *in this corpus*.
        """
        links = self.store.get_links(node_id)
        if edge_type is None:
            candidates: list = []
            for group in links.values():
                candidates.extend(group)
        else:
            candidates = list(links.get(edge_type, []))

        # Dicts keep insertion order, so ``setdefault`` retains the first
        # occurrence of every target at its first-seen position. Cross-corpus
        # targets are lists (unhashable), hence the tuple key.
        first_seen: dict = {}
        for target in candidates:
            ident = tuple(target) if isinstance(target, list) else target
            first_seen.setdefault(ident, target)
        return list(first_seen.values())

    def edge_types(self, node_id: str) -> list[str]:
        """Return the edge types present on *node_id* (``[]`` if none)."""
        return [*self.store.get_links(node_id)]
# ---- node identity -------------------------------------------------------- #
def canonical_node_id(target: Any, *, source: str | None) -> tuple[str | None, str]:
    """Resolve a neighbor *target* to its ``(source, artifact_id)`` node id.

    ``(source, artifact_id)`` is the repo's node identity, and the key a
    traversal's visited-set must use so that one id appearing in two corpora
    remains two distinct nodes. :meth:`CorpusGraph.neighbors` returns targets
    in stored form, which is one of:

    - a bare ``artifact_id`` — implicitly located in *source*, the graph the
      target was read from;
    - a two-element ``[source, artifact_id]`` list (or tuple) naming a
      cross-corpus node explicitly.

    Either form is mapped to the canonical tuple. Anything that is not a
    two-element list/tuple is treated as a bare id and paired with *source*
    unchanged.

    >>> canonical_node_id("dol", source="packages")
    ('packages', 'dol')
    >>> canonical_node_id(["skills", "deploy"], source="packages")
    ('skills', 'deploy')
    """
    is_explicit_pair = isinstance(target, (list, tuple)) and len(target) == 2
    if not is_explicit_pair:
        return (source, target)
    origin, artifact_id = target
    return (origin, artifact_id)
# ---- edge ingest ---------------------------------------------------------- #
#: An edge extractor: ``(artifact_id, filter_fields) -> {edge_type: [target]}``.
#: An injectable build-time seam (like :data:`~ir.formulate.Formulator`) that
#: turns an artifact's filter fields into outgoing edges. ``build(...,
#: edge_extractor=...)`` runs it; :func:`default_edge_extractor` is shipped.
#: The return type is written as a string, so only the argument types are
#: evaluated when this alias is defined.
EdgeExtractor = Callable[[str, Mapping[str, Any]], "Mapping[str, list]"]
#: Split a PEP 508 dependency spec at the first version/extra/marker/url
#: delimiter, leaving the bare distribution name (``"numpy[fast]>=1.2;
#: python>='3.9'"`` → ``"numpy"``; the direct-reference ``"dol@git+https://…"``
#: → ``"dol"``). ``@`` is included so PEP 508 URL deps yield the bare name.
#: The delimiter set is one character from: ``<`` ``>`` ``=`` ``!`` ``~``
#: ``;`` ``@`` ``[`` ``(`` or any whitespace.
_DEP_DELIM = re.compile(r"[<>=!~;@\[\s(]")
def _dep_name(dep: str) -> str:
    """Return the bare distribution name of a dependency spec.

    Surrounding whitespace is removed, everything from the first
    ``_DEP_DELIM`` delimiter onward is discarded, and the remaining name is
    stripped and lower-cased. A spec with no delimiter is returned whole
    (stripped, lower-cased).
    """
    spec = dep.strip()
    cut = _DEP_DELIM.search(spec)
    # Keep only the text before the first delimiter, if there is one.
    head = spec if cut is None else spec[: cut.start()]
    return head.strip().lower()