mindnet/app/core/derive_edges.py
Lars bd997e61d6
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
app/core/derive_edges.py aktualisiert
2025-11-17 15:46:56 +01:00

237 lines
7.4 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from __future__ import annotations
import re
from typing import Dict, Iterable, List, Optional, Tuple
# ------------------------------
# Edge payload helper
# ------------------------------
def _edge_payload(
    *,
    note_id: str,
    chunk_id: Optional[str],
    kind: str,
    source_id: str,
    target_id: str,
    rule_id: str,
    scope: str = "chunk",
    confidence: Optional[float] = None,
) -> Dict:
    """Assemble the payload dict that describes a single edge.

    All arguments are keyword-only. The returned dict always carries the
    keys ``note_id``, ``chunk_id``, ``kind``, ``scope``, ``source_id``,
    ``target_id`` and ``rule_id`` (in that order). A ``confidence`` key is
    appended, coerced to ``float``, only when a confidence value is given.
    """
    payload: Dict = dict(
        note_id=note_id,
        chunk_id=chunk_id,
        kind=kind,
        scope=scope,
        source_id=source_id,
        target_id=target_id,
        rule_id=rule_id,
    )
    if confidence is None:
        # No score supplied: the key is left out entirely rather than set to None.
        return payload
    payload["confidence"] = float(confidence)
    return payload
# ------------------------------
# Inline [[wikilink]] parser
# ------------------------------
# Matches ``[[...]]`` and captures everything between the brackets
# (any run of characters that does not contain ``]``).
_WIKILINK_RE = re.compile(r"\[\[([^\]]+)\]\]")


def _iter_wikilinks(text: str) -> Iterable[str]:
    """Yield the trimmed inner text of every ``[[...]]`` link found in *text*.

    Links whose inner text is empty after trimming (e.g. ``[[ ]]``) are
    skipped, so callers never receive an empty target. This matches the
    filtering done for callout targets in ``_parse_callout_line``.
    A ``None`` or empty *text* yields nothing.

    The captured text is yielded as written; an alias or heading suffix such
    as ``[[Title|Alias]]`` is not split off here.
    """
    for match in _WIKILINK_RE.finditer(text or ""):
        target = match.group(1).strip()
        if target:
            yield target
# ------------------------------
# Callout parser
# Syntax:
# > [!edge] related_to: [[Vector DB Basics]] [[Embeddings 101]]
# Multiple targets per line are allowed.
# ------------------------------
# One callout line: optional indent, ">", "[!edge]", a relation name made of
# letters/underscores, a colon, and then the rest of the line (the targets).
_CALLOUT_RE = re.compile(
    r"^\s*>\s*\[!edge\]\s*([a-z_]+)\s*:\s*(.+)$",
    flags=re.IGNORECASE,
)


def _parse_callout_line(line: str) -> Optional[Tuple[str, List[str]]]:
    """Parse one ``> [!edge] relation: [[A]] [[B]]`` callout line.

    Returns ``(relation, targets)`` where *relation* is lower-cased and
    *targets* holds the trimmed, non-empty wikilink texts found after the
    colon. Returns ``None`` when the line is not an edge callout or when it
    names no usable target.
    """
    match = _CALLOUT_RE.match(line)
    if match is None:
        return None

    raw_relation, remainder = match.groups()

    targets: List[str] = []
    for raw_target in _WIKILINK_RE.findall(remainder):
        cleaned = raw_target.strip()
        if cleaned:
            targets.append(cleaned)

    # A callout without any usable target is treated like a non-callout line.
    if not targets:
        return None
    return (raw_relation.strip().lower(), targets)
# ------------------------------
# Apply defaults from types.yaml (if configured)
# types_cfg example:
# { "types": { "project": { "edge_defaults": ["references","depends_on"] }, ... } }
# ------------------------------
def _edge_defaults_for_type(types_cfg: Dict, note_type: str) -> List[str]:
    """Return the normalised ``edge_defaults`` relation names for *note_type*.

    Looks up ``types_cfg["types"][note_type]["edge_defaults"]`` and returns
    each entry stringified, trimmed and lower-cased; blank entries are
    dropped. Returns an empty list when any level of that path is missing
    or ``None`` (an empty YAML mapping such as ``types:`` loads as ``None``).

    A single scalar string (``edge_defaults: references``) is treated as a
    one-element list instead of being iterated character by character, and
    ``None`` entries inside the list are ignored.
    """
    # ``or {}`` at every level: a key may be present with a None value.
    types_section = (types_cfg or {}).get("types") or {}
    type_def = types_section.get(note_type) or {}
    raw = type_def.get("edge_defaults") or []
    if isinstance(raw, str):
        raw = [raw]
    normalised = (str(value).strip().lower() for value in raw if value is not None)
    return [name for name in normalised if name]
# ------------------------------
# Main function: derive edges
# Expected inputs:
# note: { "note_id","title","type","text", ... }
# chunks: [ { "chunk_id","note_id","index","ord","text","window", ... }, ... ]
# types_cfg: loaded types.yaml as a dict
# ------------------------------
def derive_edges(
    note: Dict,
    chunks: List[Dict],
    types_cfg: Optional[Dict] = None,
) -> List[Dict]:
    """Derive all edges for one note and its chunks.

    Edges are collected in this order and then de-duplicated:

    1. Structure: per chunk a ``belongs_to`` edge to the note, plus ``next``
       and ``prev`` edges to the neighbouring chunks.
    2. Inline ``[[wikilinks]]`` as ``references`` edges, first per chunk
       (from the chunk's ``window``, falling back to ``text``), then once
       for the whole note text.
    3. ``> [!edge] relation: [[A]] [[B]]`` callout lines, one edge per target.
    4. ``edge_defaults`` configured for the note's type in *types_cfg*, one
       edge per chunk and relation.
    5. De-duplication on ``(source_id, target_id, kind, rule_id)``.

    Args:
        note: Note dict; reads ``note_id`` (or ``id``), ``title``, ``type``
            and ``text``.
        chunks: Chunk dicts; reads ``chunk_id``, ``window`` and ``text``.
            ``None`` is treated as an empty list.
        types_cfg: Loaded type configuration, or ``None``.

    Returns:
        The list of unique edge payload dicts, in first-seen order.

    Wikilink and callout targets are stored as the link text (a title), not
    as a resolved note id.
    """
    note_id = note.get("note_id") or note.get("id")
    note_title = note.get("title") or ""
    note_type = (note.get("type") or "").strip().lower()
    text = note.get("text") or ""

    chunks = chunks or []
    # Ids are read with .get throughout, so a chunk without "chunk_id"
    # contributes None instead of raising KeyError for its neighbours only.
    chunk_ids = [ch.get("chunk_id") for ch in chunks]

    edges: List[Dict] = []

    # 1) Structural edges per chunk: belongs_to / next / prev
    last_index = len(chunk_ids) - 1
    for i, cid in enumerate(chunk_ids):
        edges.append(
            _edge_payload(
                note_id=note_id,
                chunk_id=cid,
                kind="belongs_to",
                source_id=cid,
                target_id=note_id,
                rule_id="structure:v1:belongs_to",
                scope="chunk",
            )
        )
        if i < last_index:
            edges.append(
                _edge_payload(
                    note_id=note_id,
                    chunk_id=cid,
                    kind="next",
                    source_id=cid,
                    target_id=chunk_ids[i + 1],
                    rule_id="structure:v1:next",
                    scope="chunk",
                )
            )
        if i > 0:
            edges.append(
                _edge_payload(
                    note_id=note_id,
                    chunk_id=cid,
                    kind="prev",
                    source_id=cid,
                    target_id=chunk_ids[i - 1],
                    rule_id="structure:v1:prev",
                    scope="chunk",
                )
            )

    # 2) Inline wikilinks ([[Title]]) => references.
    #    target_id holds the link title here; a resolver can map it to a
    #    note_id later.
    # 2a) chunk scope: links found in each chunk's window/text
    for cid, ch in zip(chunk_ids, chunks):
        body = ch.get("window") or ch.get("text") or ""
        for tgt in _iter_wikilinks(body):
            edges.append(
                _edge_payload(
                    note_id=note_id,
                    chunk_id=cid,
                    kind="references",
                    source_id=cid,
                    target_id=tgt,  # title
                    rule_id="inline:rel:v1:references",
                    scope="chunk",
                    confidence=0.8,
                )
            )

    # 2b) note scope: links found in the full note text
    for tgt in _iter_wikilinks(text):
        edges.append(
            _edge_payload(
                note_id=note_id,
                chunk_id=None,
                kind="references",
                source_id=note_id,
                target_id=tgt,  # title
                rule_id="explicit:ref:v1:wikilink",
                scope="note",
                confidence=0.8,
            )
        )

    # 3) Callouts:
    #    > [!edge] related_to: [[A]] [[B]]
    #    => one edge per target with rule_id="callout:edge:v1:<relation>"
    for cid, ch in zip(chunk_ids, chunks):
        body = ch.get("window") or ch.get("text") or ""
        for line in body.splitlines():
            parsed = _parse_callout_line(line)
            if not parsed:
                continue
            # The parser already returns the relation lower-cased.
            relation, targets = parsed
            rule_tag = f"callout:edge:v1:{relation}"
            for tgt in targets:
                edges.append(
                    _edge_payload(
                        note_id=note_id,
                        chunk_id=cid,
                        kind=relation,
                        source_id=cid,
                        target_id=tgt,  # title
                        rule_id=rule_tag,
                        scope="chunk",
                        confidence=0.7,
                    )
                )

    # 4) Derived edges (edge_defaults) from the type configuration,
    #    e.g. project -> ["references", "depends_on"]
    defaults = _edge_defaults_for_type(types_cfg or {}, note_type)
    if defaults:
        rule_prefix = f"edge_defaults:{note_type}"
        # Soft target marker: the note's own title, or its id if untitled.
        soft_target = note_title or note_id
        for cid in chunk_ids:
            for rel in defaults:
                edges.append(
                    _edge_payload(
                        note_id=note_id,
                        chunk_id=cid,
                        kind=rel,
                        source_id=cid,
                        target_id=soft_target,
                        rule_id=f"{rule_prefix}:{rel}",
                        scope="chunk",
                        confidence=0.7,
                    )
                )

    # 5) De-duplication: key = (source_id, target_id, kind, rule_id).
    #    A repeated key keeps its first position but the last payload seen.
    unique: Dict[Tuple[Optional[str], Optional[str], str, str], Dict] = {}
    for edge in edges:
        key = (edge["source_id"], edge["target_id"], edge["kind"], edge["rule_id"])
        unique[key] = edge
    return list(unique.values())