mindnet/app/core/derive_edges.py
Lars e03dd66051
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 5s
app/core/derive_edges.py aktualisiert
2025-11-17 16:14:57 +01:00

279 lines
9.0 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from __future__ import annotations
import re
from typing import Dict, List, Tuple, Iterable, Set
# --------------------------------------------
# Helpers
# --------------------------------------------
# Plain wikilink: [[Target]]. Group 1 is everything between the double brackets.
# NOTE: the pattern itself also matches the typed inline form
# [[rel:depends_on | Target]], because that form uses the same brackets.
WIKILINK_RE = re.compile(r"\[\[([^\]]+?)\]\]")
# Inline relations:
# [[rel:depends_on | Target]] or [[rel:related_to Target]]
# Group 1 is the relation token. The target is captured in group 2 (pipe form)
# or in group 3 (whitespace form, including its leading whitespace).
INLINE_REL_RE = re.compile(
    r"""\[\[\s*rel\s*:\s*([a-zA-Z_][\w\-]*)\s*(?:\|\s*([^\]]+?)|(\s+[^\]]+?))\s*\]\]"""
)
# Callout lines:
# > [!edge] related_to: [[A]] [[B]]
# Allows flexible whitespace and case; the relation token matches [a-zA-Z_][\w-]*.
# Group 1 is the relation token, group 2 is the rest of the line after the colon.
CALLOUT_LINE_RE = re.compile(
    r"""^\s*>\s*\[\s*!edge\s*\]\s*([a-zA-Z_][\w\-]*)\s*:\s*(.+?)\s*$""",
    re.IGNORECASE,
)
def _chunk_text(payload: Dict) -> str:
    """Return the text content of a chunk payload.

    The ``"text"`` field wins when it holds a truthy value; otherwise the
    ``"window"`` field is used. If neither is truthy, an empty string is
    returned.
    """
    for field in ("text", "window"):
        content = payload.get(field)
        if content:
            return content
    return ""
def _make_edge(
    *,
    note_id: str,
    chunk_id: str | None,
    source_id: str,
    target_id: str,
    relation: str,
    rule_id: str,
    scope: str = "chunk",
    confidence: float | None = None,
) -> Dict:
    """Build the payload dict for a single edge.

    All arguments are keyword-only. The relation name is stored twice, under
    ``"kind"`` (kept for backward compatibility) and under ``"relation"``
    (used by evaluation scripts). A falsy ``chunk_id`` is stored as ``None``.
    The ``"confidence"`` key is only present when a value other than ``None``
    is passed.
    """
    edge: Dict = {
        "note_id": note_id,
        "chunk_id": chunk_id or None,
        "scope": scope,
        "kind": relation,
        "relation": relation,
        "source_id": source_id,
        "target_id": target_id,
        "rule_id": rule_id,
    }
    if confidence is None:
        return edge
    return {**edge, "confidence": confidence}
def _dedup(edges: Iterable[Dict]) -> List[Dict]:
    """Drop duplicate edges, keeping the first occurrence of each.

    Two edges count as duplicates when they agree on source id, target id,
    relation (falling back to ``"kind"`` when ``"relation"`` is missing or
    empty) and rule id. Missing or falsy fields are treated as empty strings.
    The relative order of the surviving edges is preserved.
    """

    def _identity(edge: Dict) -> Tuple[str, str, str, str]:
        return (
            str(edge.get("source_id") or ""),
            str(edge.get("target_id") or ""),
            str(edge.get("relation") or edge.get("kind") or ""),
            str(edge.get("rule_id") or ""),
        )

    # Dicts keep insertion order and setdefault never overwrites, so the
    # first edge seen for each identity is the one that is retained.
    first_by_identity: Dict[Tuple[str, str, str, str], Dict] = {}
    for edge in edges:
        first_by_identity.setdefault(_identity(edge), edge)
    return list(first_by_identity.values())
def _wikilink_targets(text: str) -> List[str]:
    """Return the stripped targets of all plain ``[[...]]`` wikilinks in *text*.

    Typed inline relations such as ``[[rel:depends_on | Target]]`` use the
    same double-bracket syntax and are therefore matched by ``WIKILINK_RE``
    as well. They are not plain references: without the filter below their
    whole inner text (``"rel:depends_on | Target"``) would be returned as a
    target. Any match that is a complete inline relation according to
    ``INLINE_REL_RE`` is skipped; those are extracted by
    ``_inline_relations``.

    Targets are returned in order of appearance; duplicates are kept.
    """
    return [
        m.group(1).strip()
        for m in WIKILINK_RE.finditer(text)
        if not INLINE_REL_RE.fullmatch(m.group(0))
    ]
def _inline_relations(text: str) -> List[Tuple[str, str]]:
    """Extract typed inline relations from *text* as ``(relation, target)`` pairs.

    Both spellings are accepted:
        [[rel:depends_on | Target]]
        [[rel:depends_on Target]]

    The relation name is lower-cased, the target is stripped of surrounding
    whitespace, and matches whose target ends up empty are dropped.
    """
    pairs: List[Tuple[str, str]] = []
    for match in INLINE_REL_RE.finditer(text):
        name, piped, spaced = match.groups()
        target = (piped or spaced or "").strip()
        # Defensive: remove one leading pipe that ended up inside the capture.
        if target[:1] == "|":
            target = target[1:].strip()
        if not target:
            continue
        pairs.append((name.strip().lower(), target))
    return pairs
def _callout_relations(lines: List[str]) -> List[Tuple[str, List[str]]]:
    """Collect edge callouts from *lines*.

    A callout line looks like:
        > [!edge] related_to: [[A]] [[B]]

    Returns one ``(relation, [targets...])`` entry per matching line, with
    the relation lower-cased. Lines whose tail yields no wikilink targets
    are skipped.
    """
    found: List[Tuple[str, List[str]]] = []
    for line in lines:
        hit = CALLOUT_LINE_RE.match(line)
        if hit is None:
            continue
        relation, tail = hit.group(1), hit.group(2)
        targets = _wikilink_targets(tail)
        if targets:
            found.append((relation.strip().lower(), targets))
    return found
# --------------------------------------------
# Public entry point
# --------------------------------------------
def derive_edges(note_core: Dict, chunks: List[Dict], types_cfg: Dict | None = None) -> List[Dict]:
    """Derive all edge payloads for one note and its chunks.

    Args:
        note_core: Note payload. The keys ``"note_id"``, ``"type"`` and
            ``"text"`` are read here.
        chunks: Chunk payloads in document order. ``"chunk_id"`` is read from
            each one; the chunk text is taken from ``"text"`` or ``"window"``.
        types_cfg: Loaded ``types.yaml`` content (a dict) or ``None``. Only
            ``types_cfg["types"][<note type>]["edge_defaults"]`` is consulted.
            Missing or ``None`` levels are treated as "no defaults", and a
            single string is accepted as a one-element list.

    Returns:
        De-duplicated edge payloads, emitted in this order:
          1. structural edges per chunk: ``belongs_to``, ``next``, ``prev``
          2. wikilinks in chunk text -> ``references``
          3. inline relations, e.g. ``[[rel:depends_on | Target]]``
          4. callouts in the note text, e.g.
             ``> [!edge] related_to: [[A]] [[B]]``
          5. type defaults: every ``edge_defaults`` relation from every chunk
             to every target found in steps 2-4
    """
    nid = note_core.get("note_id")
    ntype = (note_core.get("type") or "").strip().lower()
    note_lines = (note_core.get("text") or "").splitlines()
    chunk_ids = [ch.get("chunk_id") for ch in chunks]
    chunk_texts = [_chunk_text(ch) for ch in chunks]

    edges: List[Dict] = []

    # -------------------------------------------------
    # 1) Structural edges per chunk
    # -------------------------------------------------
    for i, cid in enumerate(chunk_ids):
        edges.append(
            _make_edge(
                note_id=nid,
                chunk_id=cid,
                source_id=cid,
                target_id=nid,
                relation="belongs_to",
                rule_id="struct:belongs_to",
                confidence=1.0,
            )
        )
        if i + 1 < len(chunk_ids):
            nxt = chunk_ids[i + 1]
            edges.append(
                _make_edge(
                    note_id=nid,
                    chunk_id=cid,
                    source_id=cid,
                    target_id=nxt,
                    relation="next",
                    rule_id="struct:next",
                    confidence=0.99,
                )
            )
            # The reverse edge is owned by the following chunk.
            edges.append(
                _make_edge(
                    note_id=nid,
                    chunk_id=nxt,
                    source_id=nxt,
                    target_id=cid,
                    relation="prev",
                    rule_id="struct:prev",
                    confidence=0.99,
                )
            )

    # -------------------------------------------------
    # 2) Explicit references from each chunk text (wikilinks)
    # -------------------------------------------------
    explicit_targets: Set[str] = set()
    for cid, txt in zip(chunk_ids, chunk_texts):
        for tgt in _wikilink_targets(txt):
            explicit_targets.add(tgt)
            edges.append(
                _make_edge(
                    note_id=nid,
                    chunk_id=cid,
                    source_id=cid,
                    target_id=tgt,
                    relation="references",
                    rule_id="explicit:wikilink",
                    confidence=0.9,
                )
            )

    # -------------------------------------------------
    # 3) Inline relations (typed edges inside the text)
    # -------------------------------------------------
    for cid, txt in zip(chunk_ids, chunk_texts):
        for rel, tgt in _inline_relations(txt):
            explicit_targets.add(tgt)
            edges.append(
                _make_edge(
                    note_id=nid,
                    chunk_id=cid,
                    source_id=cid,
                    target_id=tgt,
                    relation=rel,
                    rule_id=f"inline:rel:v1:{rel}",
                    confidence=0.8,
                )
            )

    # -------------------------------------------------
    # 4) Callout relations (> [!edge] related_to: [[A]] [[B]])
    #    They are found in the note text, but are attached to the first
    #    chunk (if there is one) so that scope stays 'chunk'.
    # -------------------------------------------------
    callouts = _callout_relations(note_lines)
    if callouts and chunks:
        first_cid = chunk_ids[0]
        for rel, tgts in callouts:
            for tgt in tgts:
                explicit_targets.add(tgt)
                edges.append(
                    _make_edge(
                        note_id=nid,
                        chunk_id=first_cid,
                        source_id=first_cid,
                        target_id=tgt,
                        relation=rel,
                        rule_id=f"callout:edge:v1:{rel}",
                        confidence=0.8,
                    )
                )

    # -------------------------------------------------
    # 5) Type defaults (edge_defaults) from types.yaml
    #    Each chunk gets every default relation to every target collected
    #    above (wikilinks / inline / callouts).
    # -------------------------------------------------
    raw_defaults = None
    if isinstance(types_cfg, dict):
        # An empty YAML mapping loads as None, so every level is checked
        # instead of chaining .get() calls on possibly-None values.
        type_defs = types_cfg.get("types")
        tdef = type_defs.get(ntype) if isinstance(type_defs, dict) else None
        raw_defaults = tdef.get("edge_defaults") if isinstance(tdef, dict) else None
    if isinstance(raw_defaults, str):
        # A scalar is one relation name, not a sequence of characters.
        raw_defaults = [raw_defaults]
    default_rels = [
        rel_norm
        for rel_norm in (str(rel).strip().lower() for rel in (raw_defaults or []))
        if rel_norm
    ]

    if default_rels and explicit_targets:
        # Sorted once so the output order is deterministic.
        ordered_targets = sorted(explicit_targets)
        for cid in chunk_ids:
            for rel_norm in default_rels:
                for tgt in ordered_targets:
                    edges.append(
                        _make_edge(
                            note_id=nid,
                            chunk_id=cid,
                            source_id=cid,
                            target_id=tgt,
                            relation=rel_norm,
                            rule_id=f"edge_defaults:{ntype}:{rel_norm}",
                            confidence=0.7,
                        )
                    )

    # -------------------------------------------------
    # 6) De-duplicate
    # -------------------------------------------------
    return _dedup(edges)