#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Derive graph edges for a note and its chunks:
#   1) structural edges (belongs_to / next / prev),
#   2) inline [[wikilink]] references (chunk scope and note scope),
#   3) explicit "> [!edge] relation: [[Target]]" callouts,
#   4) per-type default relations configured in types.yaml,
# followed by a de-duplication pass.
from __future__ import annotations

import re
from typing import Dict, Iterable, List, Optional, Tuple


# ------------------------------
# Edge payload helper
# ------------------------------
def _edge_payload(
    *,
    note_id: str,
    chunk_id: Optional[str],
    kind: str,
    source_id: str,
    target_id: str,
    rule_id: str,
    scope: str = "chunk",
    confidence: Optional[float] = None,
) -> Dict:
    """Build the dict representing a single edge.

    The ``confidence`` key is only present when a value is given; it is
    coerced to ``float``.
    """
    payload = {
        "note_id": note_id,
        "chunk_id": chunk_id,
        "kind": kind,
        "scope": scope,
        "source_id": source_id,
        "target_id": target_id,
        "rule_id": rule_id,
    }
    if confidence is not None:
        payload["confidence"] = float(confidence)
    return payload


# ------------------------------
# Inline [[wikilink]] parser
# ------------------------------
_WIKILINK_RE = re.compile(r"\[\[([^\]]+)\]\]")


def _iter_wikilinks(text: str) -> Iterable[str]:
    """Yield the stripped target of every ``[[...]]`` link found in *text*.

    Links whose target is empty after stripping (e.g. ``[[ ]]``) are skipped,
    matching the filtering done for callout targets, so no edge with an empty
    ``target_id`` is ever produced.
    """
    for match in _WIKILINK_RE.finditer(text):
        target = match.group(1).strip()
        if target:
            yield target


# ------------------------------
# Callout parser
# Syntax:
#   > [!edge] related_to: [[Vector DB Basics]] [[Embeddings 101]]
# Multiple targets per line are allowed.
# ------------------------------
_CALLOUT_RE = re.compile(
    r"^\s*>\s*\[!edge\]\s*([a-z_]+)\s*:\s*(.+)$",
    flags=re.IGNORECASE,
)


def _parse_callout_line(line: str) -> Optional[Tuple[str, List[str]]]:
    """Parse one ``> [!edge] relation: [[A]] [[B]]`` line.

    Returns ``(relation, targets)`` with the relation lower-cased, or ``None``
    when the line is not an edge callout or names no non-empty target.
    """
    match = _CALLOUT_RE.match(line)
    if match is None:
        return None
    targets = list(_iter_wikilinks(match.group(2)))
    if not targets:
        return None
    return (match.group(1).strip().lower(), targets)


# ------------------------------
# Apply defaults from types.yaml (when configured)
# types_cfg example:
#   { "types": { "project": { "edge_defaults": ["references","depends_on"] }, ... } }
# ------------------------------
def _edge_defaults_for_type(types_cfg: Dict, note_type: str) -> List[str]:
    """Return the normalized default relation names configured for *note_type*.

    Missing or ``None`` sections (an empty YAML mapping loads as ``None``)
    yield an empty list instead of raising. A single string value is treated
    as a one-element list rather than being iterated character by character.
    Names are stripped and lower-cased; blank entries are dropped.
    """
    types_section = (types_cfg or {}).get("types") or {}
    type_def = types_section.get(note_type) or {}
    raw = type_def.get("edge_defaults") or []
    if isinstance(raw, str):
        raw = [raw]
    normalized = (str(value).strip().lower() for value in raw)
    return [name for name in normalized if name]


def _chunk_body(chunk: Dict) -> str:
    """Return the text to scan for a chunk: its window, else its text, else ""."""
    return chunk.get("window") or chunk.get("text") or ""


def _structure_edges(note_id: str, chunks: List[Dict]) -> List[Dict]:
    """Build belongs_to / next / prev edges, in that order, for every chunk."""
    edges: List[Dict] = []
    last_index = len(chunks) - 1
    for index, chunk in enumerate(chunks):
        cid = chunk.get("chunk_id")
        edges.append(
            _edge_payload(
                note_id=note_id,
                chunk_id=cid,
                kind="belongs_to",
                source_id=cid,
                target_id=note_id,
                rule_id="structure:v1:belongs_to",
                scope="chunk",
            )
        )
        if index < last_index:
            edges.append(
                _edge_payload(
                    note_id=note_id,
                    chunk_id=cid,
                    kind="next",
                    source_id=cid,
                    target_id=chunks[index + 1]["chunk_id"],
                    rule_id="structure:v1:next",
                    scope="chunk",
                )
            )
        if index > 0:
            edges.append(
                _edge_payload(
                    note_id=note_id,
                    chunk_id=cid,
                    kind="prev",
                    source_id=cid,
                    target_id=chunks[index - 1]["chunk_id"],
                    rule_id="structure:v1:prev",
                    scope="chunk",
                )
            )
    return edges


def _reference_edges(note_id: str, chunks: List[Dict], text: str) -> List[Dict]:
    """Build "references" edges from inline wikilinks.

    Chunk-scope edges come first (one per link in each chunk body), then
    note-scope edges from the full note text. ``target_id`` holds the link
    title as written; mapping titles to note ids is left to a later resolver.
    """
    edges: List[Dict] = []
    for chunk in chunks:
        cid = chunk.get("chunk_id")
        edges.extend(
            _edge_payload(
                note_id=note_id,
                chunk_id=cid,
                kind="references",
                source_id=cid,
                target_id=title,
                rule_id="inline:rel:v1:references",
                scope="chunk",
                confidence=0.8,
            )
            for title in _iter_wikilinks(_chunk_body(chunk))
        )
    edges.extend(
        _edge_payload(
            note_id=note_id,
            chunk_id=None,
            kind="references",
            source_id=note_id,
            target_id=title,
            rule_id="explicit:ref:v1:wikilink",
            scope="note",
            confidence=0.8,
        )
        for title in _iter_wikilinks(text)
    )
    return edges


def _callout_edges(note_id: str, chunks: List[Dict]) -> List[Dict]:
    """Build one edge per target of every ``> [!edge] relation: ...`` line."""
    edges: List[Dict] = []
    for chunk in chunks:
        cid = chunk.get("chunk_id")
        for line in _chunk_body(chunk).splitlines():
            parsed = _parse_callout_line(line)
            if parsed is None:
                continue
            # The parser already lower-cases the relation name.
            relation, targets = parsed
            rule_tag = f"callout:edge:v1:{relation}"
            edges.extend(
                _edge_payload(
                    note_id=note_id,
                    chunk_id=cid,
                    kind=relation,
                    source_id=cid,
                    target_id=title,
                    rule_id=rule_tag,
                    scope="chunk",
                    confidence=0.7,
                )
                for title in targets
            )
    return edges


def _default_edges(
    note_id: str,
    note_title: str,
    note_type: str,
    chunks: List[Dict],
    types_cfg: Optional[Dict],
) -> List[Dict]:
    """Build the per-type default edges from every chunk to its own note.

    The target is the note title (falling back to the note id) as a soft
    marker, so the edges work for navigation until a resolver maps titles
    to ids.
    """
    defaults = _edge_defaults_for_type(types_cfg or {}, note_type)
    if not defaults:
        return []
    rule_prefix = f"edge_defaults:{note_type}"
    soft_target = note_title or note_id
    edges: List[Dict] = []
    for chunk in chunks:
        cid = chunk.get("chunk_id")
        edges.extend(
            _edge_payload(
                note_id=note_id,
                chunk_id=cid,
                kind=relation,
                source_id=cid,
                target_id=soft_target,
                rule_id=f"{rule_prefix}:{relation}",
                scope="chunk",
                confidence=0.7,
            )
            for relation in defaults
        )
    return edges


def _dedupe_edges(edges: Iterable[Dict]) -> List[Dict]:
    """Collapse edges sharing (source_id, target_id, kind, rule_id).

    The first occurrence fixes the position in the result; the last
    occurrence supplies the payload.
    """
    unique: Dict[Tuple[str, str, str, str], Dict] = {}
    for edge in edges:
        key = (edge["source_id"], edge["target_id"], edge["kind"], edge["rule_id"])
        unique[key] = edge
    return list(unique.values())


# ------------------------------
# Main entry point: derive edges
# Expected inputs:
#   note:      { "note_id","title","type","text", ... }
#   chunks:    [ { "chunk_id","note_id","index","ord","text","window", ... }, ... ]
#   types_cfg: the loaded types.yaml as a dict
# ------------------------------
def derive_edges(
    note: Dict,
    chunks: List[Dict],
    types_cfg: Optional[Dict] = None,
) -> List[Dict]:
    """Derive all edges for *note* and its *chunks*.

    Edges are produced in this order and then de-duplicated on
    ``(source_id, target_id, kind, rule_id)``:

    1. structural edges per chunk: belongs_to, next, prev;
    2. "references" edges for inline ``[[wikilinks]]`` (chunk scope from each
       chunk's window/text, then note scope from the note text);
    3. callout edges from ``> [!edge] relation: [[A]] [[B]]`` lines;
    4. default relations configured for the note's type in *types_cfg*.

    Args:
        note: Note dict; ``note_id`` (or ``id``), ``title``, ``type`` and
            ``text`` are read.
        chunks: Chunk dicts in reading order; ``chunk_id``, ``window`` and
            ``text`` are read.
        types_cfg: Optional type configuration with a ``types`` mapping.

    Returns:
        The list of unique edge dicts.
    """
    note_id = note.get("note_id") or note.get("id")
    note_title = note.get("title") or ""
    note_type = (note.get("type") or "").strip().lower()
    text = note.get("text") or ""

    edges: List[Dict] = []
    edges.extend(_structure_edges(note_id, chunks))
    edges.extend(_reference_edges(note_id, chunks, text))
    edges.extend(_callout_edges(note_id, chunks))
    edges.extend(_default_edges(note_id, note_title, note_type, chunks, types_cfg))
    return _dedupe_edges(edges)