From 4c56918d8ad69d4206166cb8d81b3debd3cf74fe Mon Sep 17 00:00:00 2001 From: Lars Date: Mon, 17 Nov 2025 13:06:29 +0100 Subject: [PATCH] app/core/derive_edges.py aktualisiert --- app/core/derive_edges.py | 118 +++++++++++++++++++++++++++++++++++---- 1 file changed, 108 insertions(+), 10 deletions(-) diff --git a/app/core/derive_edges.py b/app/core/derive_edges.py index e9126c8..0465cb4 100644 --- a/app/core/derive_edges.py +++ b/app/core/derive_edges.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- """ Modul: app/core/derive_edges.py -Version: 2.1.0 (V2-superset mit "typed inline relations") +Version: 2.2.0 (V2-superset mit "typed inline relations" + Obsidian-Callouts) Zweck ----- @@ -12,7 +12,9 @@ und ergänzt: - **Explizite, getypte Inline-Relationen** direkt im Chunk-Text: * [[rel:depends_on | Target Title]] * [[rel:related_to Target Title]] - (beides wird erkannt; Groß-/Kleinschreibung egal) +- **Obsidian-Callouts** zur Pflege von Kanten im Markdown: + * > [!edge] related_to: [[Vector DB Basics]] + * Mehrere Zeilen im Callout werden unterstützt (alle Zeilen beginnen mit '>'). 
Konfiguration ------------- @@ -26,7 +28,7 @@ from __future__ import annotations import os import re -from typing import Dict, Iterable, List, Optional, Tuple, Set +from typing import Iterable, List, Optional, Tuple, Set try: import yaml # optional, nur für types.yaml @@ -63,7 +65,7 @@ def _dedupe_seq(seq: Iterable[str]) -> List[str]: def _edge(kind: str, scope: str, source_id: str, target_id: str, note_id: str, extra: Optional[dict] = None) -> dict: pl = { "kind": kind, - "relation": kind, # v2 Feld + "relation": kind, # v2 Feld (alias) "scope": scope, # "chunk" | "note" "source_id": source_id, "target_id": target_id, @@ -87,7 +89,7 @@ def _mk_edge_id(kind: str, s: str, t: str, scope: str, rule_id: Optional[str] = def _env(n: str, default: Optional[str] = None) -> str: v = os.getenv(n) - return v if v is not None else (default or "") + return v if v is not None else (default or "" ) def _load_types_registry() -> dict: """Lädt die YAML-Registry aus MINDNET_TYPES_FILE oder ./config/types.yaml""" @@ -136,8 +138,8 @@ _WIKILINK_RE = re.compile(r"\[\[(?:[^\|\]]+\|)?([a-zA-Z0-9_\-#:. ]+)\]\]") # Getypte Inline-Relationen: # [[rel:depends_on | Target]] # [[rel:related_to Target]] -_REL_PIPE = re.compile(r"\[\[\s*rel:(?P[a-z_]+)\s*\|\s*(?P[^\]]+?)\s*\]\]", re.IGNORECASE) -_REL_SPACE = re.compile(r"\[\[\s*rel:(?P[a-z_]+)\s+(?P[^\]]+?)\s*\]\]", re.IGNORECASE) +_REL_PIPE = re.compile(r"\[\[\s*rel:(?P[a-z_]+)\s*\|\s*(?P[^\]]+?)\s*\]\]", re.IGNORECASE) +_REL_SPACE = re.compile(r"\[\[\s*rel:(?P[a-z_]+)\s+(?P[^\]]+?)\s*\]\]", re.IGNORECASE) def _extract_typed_relations(text: str) -> Tuple[List[Tuple[str,str]], str]: """ @@ -155,6 +157,79 @@ def _extract_typed_relations(text: str) -> Tuple[List[Tuple[str,str]], str]: text = _REL_SPACE.sub(_collect, text) return pairs, text +# ---- Obsidian Callout Parser ---------------------------------------------- +# Callout-Start erkennt Zeilen wie: > [!edge] ... 
(case-insensitive) +_CALLOUT_START = re.compile(r"^\s*>\s*\[!edge\]\s*(.*)$", re.IGNORECASE) + +# Innerhalb von Callouts erwarten wir je Zeile Muster wie: +# related_to: [[Vector DB Basics]] +# depends_on: [[A]], [[B]] +# similar_to: Qdrant Vektordatenbank +_REL_LINE = re.compile(r"^(?P<kind>[a-z_]+)\s*:\s*(?P<targets>.+?)\s*$", re.IGNORECASE) + +_WIKILINKS_IN_LINE = re.compile(r"\[\[([^\]]+)\]\]") + +def _extract_callout_relations(text: str) -> Tuple[List[Tuple[str,str]], str]: + """ + Findet Obsidian-Callouts vom Typ [!edge] und extrahiert (kind, target). + Entfernt den gesamten Callout-Block aus dem Text, damit Wikilinks daraus + nicht zusätzlich als "references" gezählt werden. + """ + if not text: + return [], text + + lines = text.splitlines() + out_pairs: List[Tuple[str,str]] = [] + keep_lines: List[str] = [] + + i = 0 + while i < len(lines): + m = _CALLOUT_START.match(lines[i]) + if not m: + keep_lines.append(lines[i]) + i += 1 + continue + + # Wir sind in einem Callout-Block; erste Zeile nach dem Marker: + # Rest dieser Zeile nach [!edge] mitnehmen + block_lines: List[str] = [] + first_rest = m.group(1) or "" + if first_rest.strip(): + block_lines.append(first_rest) + + # Folgezeilen sind Teil des Callouts, solange sie weiterhin mit '>' beginnen + i += 1 + while i < len(lines) and lines[i].lstrip().startswith('>'): + # Entferne führendes '>' und evtl. 
Leerzeichen + block_lines.append(lines[i].lstrip()[1:].lstrip()) + i += 1 + + # Parse jede Blockzeile eigenständig + for bl in block_lines: + mrel = _REL_LINE.match(bl) + if not mrel: + continue + kind = (mrel.group("kind") or "").strip().lower() + targets = mrel.group("targets") or "" + # Wikilinks bevorzugt + found = _WIKILINKS_IN_LINE.findall(targets) + if found: + for t in found: + t = t.strip() + if t: + out_pairs.append((kind, t)) + else: + # Fallback: Split per ',' oder ';' + for raw in re.split(r"[,;]", targets): + t = raw.strip() + if t: + out_pairs.append((kind, t)) + # Wichtig: Callout wird NICHT in keep_lines übernommen (entfernt) + continue + + remainder = "\n".join(keep_lines) + return out_pairs, remainder + def _extract_wikilinks(text: str) -> List[str]: ids: List[str] = [] for m in _WIKILINK_RE.finditer(text or ""): @@ -176,6 +251,7 @@ def build_edges_for_note( - next / prev: zwischen aufeinanderfolgenden Chunks - references: pro Chunk aus window/text (via Wikilinks) - typed inline relations: [[rel:KIND | Target]] oder [[rel:KIND Target]] + - Obsidian Callouts: > [!edge] KIND: [[Target]] - optional note-scope references/backlinks: dedupliziert über alle Chunk-Funde + note_level_references - typenbasierte Default-Kanten (edge_defaults) je gefundener Referenz """ @@ -221,7 +297,7 @@ def build_edges_for_note( "confidence": 0.95, })) - # --- 3) references (chunk-scope) + typed inline relations + abgeleitete Relationen je Ref --- + # --- 3) references (chunk-scope) + inline relations + callouts + abgeleitete Relationen je Ref --- reg = _load_types_registry() defaults = _edge_defaults_for(note_type, reg) refs_all: List[str] = [] @@ -252,8 +328,30 @@ def build_edges_for_note( "confidence": 0.95, })) - # b) generische Wikilinks (remainder) → "references" - refs = _extract_wikilinks(remainder) + # b) Obsidian Callouts extrahieren (und aus remainder entfernen) + call_pairs, remainder2 = _extract_callout_relations(remainder) + for kind, target in call_pairs: + 
k = (kind or "").strip().lower() + if not k or not target: + continue + edges.append(_edge(k, "chunk", cid, target, note_id, { + "chunk_id": cid, + "edge_id": _mk_edge_id(k, cid, target, "chunk", "callout:edge:v1"), + "provenance": "explicit", + "rule_id": "callout:edge:v1", + "confidence": 0.95, + })) + if k in {"related_to", "similar_to"}: + edges.append(_edge(k, "chunk", target, cid, note_id, { + "chunk_id": cid, + "edge_id": _mk_edge_id(k, target, cid, "chunk", "callout:edge:v1"), + "provenance": "explicit", + "rule_id": "callout:edge:v1", + "confidence": 0.95, + })) + + # c) generische Wikilinks (remainder2) → "references" + refs = _extract_wikilinks(remainder2) for r in refs: # reale Referenz (wie bisher) edges.append(_edge("references", "chunk", cid, r, note_id, {