#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Derive graph edges (structural, explicit and type-default) from note chunks."""
from __future__ import annotations

import re
from typing import Dict, List, Tuple, Iterable, Set

# --------------------------------------------
# Helpers
# --------------------------------------------

# Any double-bracket link: [[Target]]
WIKILINK_RE = re.compile(r"\[\[([^\]]+?)\]\]")

# Inline relations:
#   [[rel:depends_on | Target]]  or  [[rel:related_to Target]]
INLINE_REL_RE = re.compile(
    r"""\[\[\s*rel\s*:\s*([a-zA-Z_][\w\-]*)\s*(?:\|\s*([^\]]+?)|(\s+[^\]]+?))\s*\]\]"""
)

# Callout lines:
#   > [!edge] related_to: [[A]] [[B]]
# Whitespace and case are flexible; the relation token is [a-zA-Z_][\w-]*.
CALLOUT_LINE_RE = re.compile(
    r"""^\s*>\s*\[\s*!edge\s*\]\s*([a-zA-Z_][\w\-]*)\s*:\s*(.+?)\s*$""",
    re.IGNORECASE,
)


def _chunk_text(payload: Dict) -> str:
    """Return the chunk's text: 'text' if truthy, else 'window', else ''."""
    return payload.get("text") or payload.get("window") or ""


def _make_edge(
    *,
    note_id: str,
    chunk_id: str | None,
    source_id: str,
    target_id: str,
    relation: str,
    rule_id: str,
    scope: str = "chunk",
    confidence: float | None = None,
) -> Dict:
    """Build one edge payload dict.

    The relation is stored under both 'kind' and 'relation'. The 'confidence'
    key is only present when a value was supplied. A falsy ``chunk_id`` is
    normalised to ``None``.
    """
    pl = {
        "note_id": note_id,
        "chunk_id": chunk_id or None,
        "scope": scope,
        "kind": relation,  # kept for backward compatibility
        "relation": relation,  # used by evaluation scripts
        "source_id": source_id,
        "target_id": target_id,
        "rule_id": rule_id,
    }
    if confidence is not None:
        pl["confidence"] = confidence
    return pl


def _dedup(edges: Iterable[Dict]) -> List[Dict]:
    """Drop duplicate edges, keeping the first occurrence and input order.

    Two edges are duplicates when source_id, target_id, relation (falling
    back to 'kind') and rule_id are all equal after string conversion.
    """
    seen: Set[Tuple[str, str, str, str]] = set()
    out: List[Dict] = []
    for e in edges:
        key = (
            str(e.get("source_id") or ""),
            str(e.get("target_id") or ""),
            str(e.get("relation") or e.get("kind") or ""),
            str(e.get("rule_id") or ""),
        )
        if key in seen:
            continue
        seen.add(key)
        out.append(e)
    return out


def _wikilink_targets(text: str) -> List[str]:
    """Return the stripped targets of all plain ``[[...]]`` links in *text*.

    Links that are typed inline relations (``[[rel:<relation> ...]]``) are
    skipped: WIKILINK_RE matches them too, but they are reported by
    ``_inline_relations`` and must not also show up as references to a
    target literally named "rel:...". Whitespace-only links are skipped.
    """
    targets: List[str] = []
    for m in WIKILINK_RE.finditer(text):
        if INLINE_REL_RE.fullmatch(m.group(0)):
            continue
        tgt = m.group(1).strip()
        if tgt:
            targets.append(tgt)
    return targets


def _inline_relations(text: str) -> List[Tuple[str, str]]:
    """Return a list of (relation, target) pairs found in *text*.

    Both notations are accepted:
        [[rel:depends_on | Target]]
        [[rel:depends_on Target]]
    The relation is lower-cased; matches with an empty target are dropped.
    """
    out: List[Tuple[str, str]] = []
    for m in INLINE_REL_RE.finditer(text):
        rel = m.group(1).strip().lower()
        tgt = (m.group(2) or m.group(3) or "").strip()
        if tgt.startswith("|"):
            tgt = tgt[1:].strip()
        if tgt:
            out.append((rel, tgt))
    return out


def _callout_relations(lines: List[str]) -> List[Tuple[str, List[str]]]:
    """Find callout lines and return a list of (relation, [targets...]).

    A callout line looks like:
        > [!edge] related_to: [[A]] [[B]]
    Lines whose tail contains no wikilink target are ignored.
    """
    out: List[Tuple[str, List[str]]] = []
    for ln in lines:
        m = CALLOUT_LINE_RE.match(ln)
        if not m:
            continue
        rel = m.group(1).strip().lower()
        targets = _wikilink_targets(m.group(2))
        if targets:
            out.append((rel, targets))
    return out


def _type_edge_defaults(types_cfg: Dict | None, ntype: str) -> List[str]:
    """Return the normalised ``edge_defaults`` relations for note type *ntype*.

    Reads ``types_cfg["types"][ntype]["edge_defaults"]``. Missing or null
    intermediate levels yield an empty list instead of raising. A single
    string is treated as a one-element list, not as a sequence of characters.
    Entries are stringified, stripped and lower-cased; blank entries are
    dropped.
    """
    if not isinstance(types_cfg, dict):
        return []
    types_section = types_cfg.get("types")
    if not isinstance(types_section, dict):
        return []
    tdef = types_section.get(ntype)
    if not isinstance(tdef, dict):
        return []
    raw = tdef.get("edge_defaults") or []
    if isinstance(raw, str):
        raw = [raw]
    defaults: List[str] = []
    for rel in list(raw):
        rel_norm = str(rel).strip().lower()
        if rel_norm:
            defaults.append(rel_norm)
    return defaults


# --------------------------------------------
# Public entry point
# --------------------------------------------

def derive_edges(note_core: Dict, chunks: List[Dict], types_cfg: Dict | None = None) -> List[Dict]:
    """Derive all edges for one note and its chunks.

    note_core: {"note_id", "title", "type", "text"}
    chunks:    list of chunk payloads (reads 'chunk_id' and 'text'/'window')
    types_cfg: loaded types.yaml (dict), optional

    Produces, in this order:
      - structural edges: belongs_to, next, prev
      - real references: wikilinks -> references
      - inline relations: [[rel:depends_on | Target]] -> depends_on
      - callouts: > [!edge] related_to: [[A]] [[B]] -> related_to
      - type defaults: types.yaml edge_defaults -> relations between every
        chunk and every explicitly mentioned target

    Returns the de-duplicated edge list (first occurrence wins).
    """
    nid = note_core.get("note_id")
    ntype = (note_core.get("type") or "").strip().lower()
    ntext = note_core.get("text") or ""
    lines = ntext.splitlines()

    edges: List[Dict] = []

    # -------------------------------------------------
    # 1) Structural edges per chunk
    # -------------------------------------------------
    for i, ch in enumerate(chunks):
        cid = ch.get("chunk_id")
        edges.append(
            _make_edge(
                note_id=nid,
                chunk_id=cid,
                source_id=cid,
                target_id=nid,
                relation="belongs_to",
                rule_id="struct:belongs_to",
                confidence=1.0,
            )
        )
        if i + 1 < len(chunks):
            nxt = chunks[i + 1].get("chunk_id")
            edges.append(
                _make_edge(
                    note_id=nid,
                    chunk_id=cid,
                    source_id=cid,
                    target_id=nxt,
                    relation="next",
                    rule_id="struct:next",
                    confidence=0.99,
                )
            )
            edges.append(
                _make_edge(
                    note_id=nid,
                    chunk_id=nxt,
                    source_id=nxt,
                    target_id=cid,
                    relation="prev",
                    rule_id="struct:prev",
                    confidence=0.99,
                )
            )

    # -------------------------------------------------
    # 2) Real references from each chunk text (wikilinks)
    # -------------------------------------------------
    all_explicit_targets: Set[str] = set()

    for ch in chunks:
        cid = ch.get("chunk_id")
        txt = _chunk_text(ch)
        for tgt in _wikilink_targets(txt):
            all_explicit_targets.add(tgt)
            edges.append(
                _make_edge(
                    note_id=nid,
                    chunk_id=cid,
                    source_id=cid,
                    target_id=tgt,
                    relation="references",
                    rule_id="explicit:wikilink",
                    confidence=0.9,
                )
            )

    # -------------------------------------------------
    # 3) Inline relations (typed edges in the text)
    # -------------------------------------------------
    for ch in chunks:
        cid = ch.get("chunk_id")
        txt = _chunk_text(ch)
        for rel, tgt in _inline_relations(txt):
            all_explicit_targets.add(tgt)
            edges.append(
                _make_edge(
                    note_id=nid,
                    chunk_id=cid,
                    source_id=cid,
                    target_id=tgt,
                    relation=rel,
                    rule_id=f"inline:rel:v1:{rel}",
                    confidence=0.8,
                )
            )

    # -------------------------------------------------
    # 4) Callout relations (> [!edge] related_to: [[A]] [[B]])
    #    Defined at note level, but attached to the first chunk
    #    (if there is one) so that scope stays 'chunk'.
    # -------------------------------------------------
    callouts = _callout_relations(lines)
    if callouts and chunks:
        first_cid = chunks[0].get("chunk_id")
        for rel, tgts in callouts:
            for tgt in tgts:
                all_explicit_targets.add(tgt)
                edges.append(
                    _make_edge(
                        note_id=nid,
                        chunk_id=first_cid,
                        source_id=first_cid,
                        target_id=tgt,
                        relation=rel,
                        rule_id=f"callout:edge:v1:{rel}",
                        confidence=0.8,
                    )
                )

    # -------------------------------------------------
    # 5) Type defaults (edge_defaults) from types.yaml
    #    If present, create per chunk one relation to every target
    #    recognised in the text (wikilinks/inline/callout).
    # -------------------------------------------------
    defaults = _type_edge_defaults(types_cfg, ntype)

    if defaults and all_explicit_targets:
        # Sorted once so the emitted edge order is deterministic.
        ordered_targets = sorted(all_explicit_targets)
        for ch in chunks:
            cid = ch.get("chunk_id")
            for rel_norm in defaults:
                for tgt in ordered_targets:
                    edges.append(
                        _make_edge(
                            note_id=nid,
                            chunk_id=cid,
                            source_id=cid,
                            target_id=tgt,
                            relation=rel_norm,
                            rule_id=f"edge_defaults:{ntype}:{rel_norm}",
                            confidence=0.7,
                        )
                    )

    # -------------------------------------------------
    # 6) De-dup
    # -------------------------------------------------
    return _dedup(edges)