# app/core/derive_edges.py
# -*- coding: utf-8 -*-
"""
Edge derivation (V2).

Keeps the existing functionality and adds one extension:
- Several inline references on a single line:
    rel: <relation> [[A]] [[B]] ...

Compatible with:
- Structural edges: belongs_to / next / prev
- Explicit wikilinks -> references
- Inline relations -> inline:rel
- Callout edges -> callout:edge
- Type-based default edges (edge_defaults from types.yaml)
"""

from __future__ import annotations

import re
from typing import Dict, List, Iterable, Tuple, Set

# ----------------------------------------------------------------------
# Regex building blocks
# ----------------------------------------------------------------------

# Wikilinks: [[Title]] or [[Title|Alias]]. An optional "#anchor" part is
# matched but not captured; group 1 is the bare title.
RE_WIKILINK = re.compile(r"\[\[([^\]|#]+)(?:#[^\]|]+)?(?:\|[^\]]+)?\]\]")

# Inline relations:
#   rel: <relation> [[Target A]] [[Target B]] ...
# The named groups "rel" and "body" are read by
# _extract_inline_relations_lines; without the names the pattern does not
# even compile.
RE_INLINE_REL_LINE = re.compile(
    r"(?i)\brel\s*:\s*(?P<rel>[a-z_][a-z0-9_]+)\s+(?P<body>.+)$"
)

# Callout:
#   > [!edge] <relation>: [[A]] [[B]]
# The named groups "rel" and "body" are read by _extract_callout_edges.
RE_CALLOUT_HEADER = re.compile(
    r"^\s{0,3}>\s*\[\!edge\]\s*(?P<rel>[a-z_][a-z0-9_]+)\s*:\s*(?P<body>.*)$",
    re.IGNORECASE,
)

# ----------------------------------------------------------------------
# Utilities
# ----------------------------------------------------------------------


def _neighbors_chain(chunk_ids: List[str]) -> Iterable[Tuple[str, str]]:
    """Yield (prev, next) pairs of adjacent ids along the chunk sequence."""
    yield from zip(chunk_ids, chunk_ids[1:])


def _mk_edge_payload(
    *,
    kind: str,
    scope: str,
    note_id: str,
    chunk_id: str | None = None,
    source_id: str,
    target_id: str,
    rule_id: str,
    confidence: float,
) -> Dict:
    """
    Build an edge payload in the uniform format used by this module.

    All arguments are keyword-only. The "chunk_id" key is only present in
    the returned dict when a truthy ``chunk_id`` was passed.
    """
    pl = {
        "kind": kind,  # e.g. references, depends_on, related_to, similar_to
        "scope": scope,  # "chunk" or "note"
        "note_id": note_id,  # note context (origin)
        "source_id": source_id,  # id of the source (chunk id or note id)
        "target_id": target_id,  # target (note id, or title if resolved externally)
        "rule_id": rule_id,
        "confidence": confidence,
    }
    if chunk_id:
        pl["chunk_id"] = chunk_id
    return pl


def _extract_wikilinks(text: str) -> List[str]:
    """Return every wikilink target in *text* as a stripped title string."""
    return [m.group(1).strip() for m in RE_WIKILINK.finditer(text or "")]


def _targets_from_body(body: str) -> List[str]:
    """
    Return the relation targets contained in *body*.

    These are the wikilink targets; if *body* holds no ``[[...]]`` at all,
    the stripped remaining text is used as one single target (robustness
    fallback). An empty list means there is no usable target.
    """
    targets = _extract_wikilinks(body)
    if targets:
        return targets
    cleaned = body.strip()
    return [cleaned] if cleaned else []


def _extract_inline_relations_lines(text: str) -> List[Tuple[str, List[str]]]:
    """
    Find inline relations written on a line as:
        rel: <relation> [[Target A]] [[Target B]]

    Returns a list of (relation, [targets...]) with the relation name
    lower-cased, one entry per matching line.
    """
    out: List[Tuple[str, List[str]]] = []
    for line in (text or "").splitlines():
        # search(): the "rel:" marker may appear anywhere in the line.
        m = RE_INLINE_REL_LINE.search(line)
        if not m:
            continue
        targets = _targets_from_body(m.group("body"))
        if targets:
            out.append((m.group("rel").strip().lower(), targets))
    return out


def _extract_callout_edges(text: str) -> List[Tuple[str, List[str]]]:
    """
    Find callout edges written as:
        > [!edge] <relation>: [[A]] [[B]]

    One relation plus 1..n targets per line. Returns a list of
    (relation, [targets...]) with the relation name lower-cased.
    """
    out: List[Tuple[str, List[str]]] = []
    for line in (text or "").splitlines():
        # match(): the callout marker has to start the line.
        m = RE_CALLOUT_HEADER.match(line)
        if not m:
            continue
        targets = _targets_from_body(m.group("body"))
        if targets:
            out.append((m.group("rel").strip().lower(), targets))
    return out


# ----------------------------------------------------------------------
# Main API
# ----------------------------------------------------------------------


def _explicit_edges_from_text(
    text: str,
    *,
    note_id: str,
    source_id: str,
    scope: str,
    chunk_id: str | None = None,
) -> List[Dict]:
    """
    Build all explicit edges found in *text*, attributed to *source_id*.

    Emits, in this order: wikilinks (kind "references", rule
    "explicit:wikilink", confidence 1.0), inline relations (rule
    "inline:rel", confidence 0.95) and callout edges (rule "callout:edge",
    confidence 0.9).
    """
    extracted = (
        ("explicit:wikilink", 1.0, [("references", _extract_wikilinks(text))]),
        ("inline:rel", 0.95, _extract_inline_relations_lines(text)),
        ("callout:edge", 0.9, _extract_callout_edges(text)),
    )
    found: List[Dict] = []
    for rule_id, confidence, relations in extracted:
        for kind, targets in relations:
            for tgt in targets:
                found.append(
                    _mk_edge_payload(
                        kind=kind,
                        scope=scope,
                        note_id=note_id,
                        chunk_id=chunk_id,
                        source_id=source_id,
                        target_id=tgt,
                        rule_id=rule_id,
                        confidence=confidence,
                    )
                )
    return found


def derive_edges(
    note: Dict,
    chunks: List[Dict],
    types_cfg: Dict | None = None,
) -> List[Dict]:
    """
    Derive the edges for one note.

    Fields read from the arguments:
      note:   "note_id" (falling back to "id"), "type", "text"
      chunks: per chunk "chunk_id" and "text"
      types_cfg (loaded from types.yaml), optional:
        types_cfg["types"][<type>]["edge_defaults"] = [relation, ...]

    Returns the de-duplicated list of edge payload dicts, in emission
    order: structural edges, explicit edges, then type-based defaults.
    """
    chunks = chunks or []  # tolerate None instead of failing on iteration
    edges: List[Dict] = []
    note_id = note.get("note_id") or note.get("id")
    note_type = (note.get("type") or "").strip().lower()
    note_text = note.get("text") or ""

    # ------------------------------------------------------------------
    # 1) Structural edges per chunk: belongs_to / next / prev
    #    Chunks without a chunk_id take no part in the structure.
    # ------------------------------------------------------------------
    chunk_ids = [c.get("chunk_id") for c in chunks if c.get("chunk_id")]

    for cid in chunk_ids:
        edges.append(
            _mk_edge_payload(
                kind="belongs_to",
                scope="chunk",
                note_id=note_id,
                chunk_id=cid,
                source_id=cid,
                target_id=note_id,
                rule_id="structure:belongs_to",
                confidence=1.0,
            )
        )

    for prev_id, next_id in _neighbors_chain(chunk_ids):
        edges.append(
            _mk_edge_payload(
                kind="next",
                scope="chunk",
                note_id=note_id,
                chunk_id=prev_id,
                source_id=prev_id,
                target_id=next_id,
                rule_id="structure:next",
                confidence=1.0,
            )
        )
        edges.append(
            _mk_edge_payload(
                kind="prev",
                scope="chunk",
                note_id=note_id,
                chunk_id=next_id,
                source_id=next_id,
                target_id=prev_id,
                rule_id="structure:prev",
                confidence=1.0,
            )
        )

    # ------------------------------------------------------------------
    # 2) Explicit references (wikilinks) + inline relations + callouts
    #    - Chunk scope with source = chunk_id when the chunk has one,
    #      otherwise note scope as fallback, so that no edge is emitted
    #      with an empty source_id.
    # ------------------------------------------------------------------
    explicit_edges: List[Dict] = []
    for c in chunks:
        cid = c.get("chunk_id") or None
        explicit_edges.extend(
            _explicit_edges_from_text(
                c.get("text") or "",
                note_id=note_id,
                source_id=cid or note_id,
                scope="chunk" if cid else "note",
                chunk_id=cid,
            )
        )

    # Fallback: a note without any chunks is scanned once as a whole and
    # yields note-scope edges.
    if not chunks and note_text:
        explicit_edges.extend(
            _explicit_edges_from_text(
                note_text,
                note_id=note_id,
                source_id=note_id,
                scope="note",
            )
        )

    edges.extend(explicit_edges)

    # All explicit targets; they anchor the edge_defaults below.
    explicit_targets: Set[str] = {e["target_id"] for e in explicit_edges}

    # ------------------------------------------------------------------
    # 3) Type-based default edges (edge_defaults)
    #    - only when explicit targets exist (otherwise nothing to anchor)
    # ------------------------------------------------------------------
    if types_cfg and explicit_targets:
        type_entry = (types_cfg.get("types") or {}).get(note_type) or {}
        defaults = [
            str(d).strip().lower()
            for d in (type_entry.get("edge_defaults") or [])
            if str(d).strip()
        ]
        # Sorted once so the output order is deterministic.
        ordered_targets = sorted(explicit_targets)
        for rel in defaults:
            rule = f"edge_defaults:{note_type}:{rel}"
            # Default edges are note scope: they originate from the note.
            for tgt in ordered_targets:
                edges.append(
                    _mk_edge_payload(
                        kind=rel,
                        scope="note",
                        note_id=note_id,
                        source_id=note_id,
                        target_id=tgt,
                        rule_id=rule,
                        confidence=0.7,
                    )
                )

    # ------------------------------------------------------------------
    # 4) De-duplication (idempotent), key:
    #    (kind, scope, source_id, target_id, rule_id)
    #    The first occurrence wins and emission order is kept.
    # ------------------------------------------------------------------
    unique: Dict[Tuple[str, str, str, str, str], Dict] = {}
    for e in edges:
        key = (e["kind"], e["scope"], e["source_id"], e["target_id"], e["rule_id"])
        unique.setdefault(key, e)
    return list(unique.values())