# mindnet/app/core/derive_edges.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from __future__ import annotations

import re
from typing import Dict, Iterable, List, Optional, Tuple


# ------------------------------
# Edge payload helper
# ------------------------------
def _edge_payload(
    *,
    note_id: str,
    chunk_id: Optional[str],
    kind: str,
    source_id: str,
    target_id: str,
    rule_id: str,
    scope: str = "chunk",
    confidence: Optional[float] = None,
) -> Dict:
    p = {
        "note_id": note_id,
        "chunk_id": chunk_id,
        "kind": kind,
        "scope": scope,
        "source_id": source_id,
        "target_id": target_id,
        "rule_id": rule_id,
    }
    if confidence is not None:
        p["confidence"] = float(confidence)
    return p


# ------------------------------
# Inline [[wikilink]] parser
# ------------------------------
_WIKILINK_RE = re.compile(r"\[\[([^\]]+)\]\]")

def _iter_wikilinks(text: str) -> Iterable[str]:
    for m in _WIKILINK_RE.finditer(text):
        yield m.group(1).strip()


# ------------------------------
# Callout parser
#   Syntax:
#     > [!edge] related_to: [[Vector DB Basics]] [[Embeddings 101]]
#   Mehrere Ziele pro Zeile erlaubt.
# ------------------------------
_CALLOUT_RE = re.compile(
    r"^\s*>\s*\[!edge\]\s*([a-z_]+)\s*:\s*(.+)$",
    flags=re.IGNORECASE,
)

def _parse_callout_line(line: str) -> Optional[Tuple[str, List[str]]]:
    m = _CALLOUT_RE.match(line)
    if not m:
        return None
    relation = m.group(1).strip().lower()
    rhs = m.group(2)
    targets = [t.strip() for t in _WIKILINK_RE.findall(rhs) if t.strip()]
    if not targets:
        return None
    return (relation, targets)


# ------------------------------
# Defaults aus types.yaml anwenden (wenn konfiguriert)
#   types_cfg Beispiel:
#     { "types": { "project": { "edge_defaults": ["references","depends_on"] }, ... } }
# ------------------------------
def _edge_defaults_for_type(types_cfg: Dict, note_type: str) -> List[str]:
    tdef = (types_cfg or {}).get("types", {}).get(note_type, {})
    vals = tdef.get("edge_defaults") or []
    return [str(v).strip().lower() for v in vals if str(v).strip()]


# ------------------------------
# Hauptfunktion: Edges ableiten
# Erwartete Inputs:
#   note: { "note_id","title","type","text", ... }
#   chunks: [ { "chunk_id","note_id","index","ord","text","window", ... }, ... ]
#   types_cfg: geladene types.yaml als Dict
# ------------------------------
def derive_edges(
    note: Dict,
    chunks: List[Dict],
    types_cfg: Optional[Dict] = None,
) -> List[Dict]:
    """Derive all edge payloads for one note and its chunks.

    Edges are collected in the following order and de-duplicated at the end:

    1. Structure edges per chunk: ``belongs_to`` (chunk -> note) plus
       ``next`` / ``prev`` between neighbouring chunks in list order.
    2. ``references`` edges for every inline ``[[wikilink]]``: per chunk
       (scanning the chunk's ``window``, falling back to its ``text``) and
       once at note scope (scanning the note's ``text``).
    3. Callout edges for every line accepted by ``_parse_callout_line``;
       the parsed relation becomes the edge ``kind``.
    4. Default edges: for each relation ``_edge_defaults_for_type`` returns
       for the note's type, one edge per chunk that points at the note
       title (or at the note id when the title is empty).

    Wikilink and callout targets are stored as the raw link text (a title);
    they are not resolved to note ids here.

    Args:
        note: Note mapping. Reads ``note_id`` (falling back to ``id``),
            ``title``, ``type`` and ``text``.
        chunks: Chunk mappings. Reads ``chunk_id``, ``window`` and ``text``
            from each one, always via ``.get``, so a chunk without a
            ``chunk_id`` contributes ``None`` as id instead of raising.
            ``None`` is treated like an empty list.
        types_cfg: Optional type configuration handed to
            ``_edge_defaults_for_type``.

    Returns:
        The edge payload dicts, unique per
        ``(source_id, target_id, kind, rule_id)``, ordered by first
        occurrence of each key.
    """
    note_id = note.get("note_id") or note.get("id")
    note_title = note.get("title") or ""
    note_type = (note.get("type") or "").strip().lower()
    text = note.get("text") or ""

    # A missing chunk list is handled like an empty one.
    chunk_list = chunks or []
    # Read ids and bodies once; every later step works on these two lists.
    chunk_ids = [ch.get("chunk_id") for ch in chunk_list]
    chunk_bodies = [(ch.get("window") or ch.get("text") or "") for ch in chunk_list]

    edges: List[Dict] = []

    # 1) Structure edges per chunk: belongs_to / next / prev
    last_index = len(chunk_ids) - 1
    for i, cid in enumerate(chunk_ids):
        edges.append(
            _edge_payload(
                note_id=note_id,
                chunk_id=cid,
                kind="belongs_to",
                source_id=cid,
                target_id=note_id,
                rule_id="structure:v1:belongs_to",
                scope="chunk",
            )
        )
        if i < last_index:
            edges.append(
                _edge_payload(
                    note_id=note_id,
                    chunk_id=cid,
                    kind="next",
                    source_id=cid,
                    target_id=chunk_ids[i + 1],
                    rule_id="structure:v1:next",
                    scope="chunk",
                )
            )
        if i > 0:
            edges.append(
                _edge_payload(
                    note_id=note_id,
                    chunk_id=cid,
                    kind="prev",
                    source_id=cid,
                    target_id=chunk_ids[i - 1],
                    rule_id="structure:v1:prev",
                    scope="chunk",
                )
            )

    # 2) Inline wikilinks ([[Title]]) => references, at chunk scope and at
    #    note scope. target_id holds the link title; a resolver can map it
    #    to a note id later.
    for cid, body in zip(chunk_ids, chunk_bodies):
        for tgt in _iter_wikilinks(body):
            edges.append(
                _edge_payload(
                    note_id=note_id,
                    chunk_id=cid,
                    kind="references",
                    source_id=cid,
                    target_id=tgt,  # title
                    rule_id="inline:rel:v1:references",
                    scope="chunk",
                    confidence=0.8,
                )
            )

    for tgt in _iter_wikilinks(text):
        edges.append(
            _edge_payload(
                note_id=note_id,
                chunk_id=None,
                kind="references",
                source_id=note_id,
                target_id=tgt,  # title
                rule_id="explicit:ref:v1:wikilink",
                scope="note",
                confidence=0.8,
            )
        )

    # 3) Callouts:
    #       > [!edge] related_to: [[A]] [[B]]
    #    => one edge per target with rule_id="callout:edge:v1:<relation>"
    for cid, body in zip(chunk_ids, chunk_bodies):
        for line in body.splitlines():
            parsed = _parse_callout_line(line)
            if not parsed:
                continue
            # The relation is already lower-cased by _parse_callout_line.
            relation, targets = parsed
            rule_tag = f"callout:edge:v1:{relation}"
            for tgt in targets:
                edges.append(
                    _edge_payload(
                        note_id=note_id,
                        chunk_id=cid,
                        kind=relation,
                        source_id=cid,
                        target_id=tgt,  # title
                        rule_id=rule_tag,
                        scope="chunk",
                        confidence=0.7,
                    )
                )

    # 4) Default edges configured for the note type (edge_defaults).
    #    Every chunk gets one edge per default relation, pointing at the
    #    note title as a soft target until a resolver maps titles to ids.
    defaults = _edge_defaults_for_type(types_cfg or {}, note_type)
    if defaults:
        rule_prefix = f"edge_defaults:{note_type}"
        soft_target = note_title or note_id
        for cid in chunk_ids:
            for rel in defaults:
                edges.append(
                    _edge_payload(
                        note_id=note_id,
                        chunk_id=cid,
                        kind=rel,
                        source_id=cid,
                        target_id=soft_target,
                        rule_id=f"{rule_prefix}:{rel}",
                        scope="chunk",
                        confidence=0.7,
                    )
                )

    # 5) De-duplicate on (source_id, target_id, kind, rule_id). The dict
    #    keeps the position of a key's first occurrence and the payload of
    #    its last one.
    unique: Dict[Tuple[str, str, str, str], Dict] = {}
    for edge in edges:
        key = (edge["source_id"], edge["target_id"], edge["kind"], edge["rule_id"])
        unique[key] = edge
    return list(unique.values())