mindnet/app/core/derive_edges.py
Lars aab010ff17
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
app/core/derive_edges.py aktualisiert
2025-11-18 07:50:46 +01:00

365 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# app/core/derive_edges.py
# -*- coding: utf-8 -*-
"""
Edge-Ableitung (V2)
Beibehaltung der bestehenden Funktionalität + Erweiterung:
- Mehrere Inline-Referenzen in einer Zeile: rel: <relation> [[A]] [[B]] ...
Kompatibel mit:
- Strukturkanten: belongs_to / next / prev
- Explizite Wikilinks -> references
- Inline-Relationen -> inline:rel
- Callout-Kanten -> callout:edge
- Typbasierte Default-Kanten (edge_defaults aus types.yaml)
"""
from __future__ import annotations
import re
from typing import Dict, List, Iterable, Tuple, Set
# ----------------------------------------------------------------------
# Regex-Bausteine
# ----------------------------------------------------------------------
# Wikilinks: [[Title]] or [[Title|Alias]]; an optional "#heading" part after
# the title is tolerated. Only the title is captured (group 1).
RE_WIKILINK = re.compile(r"\[\[([^\]|#]+)(?:#[^\]|]+)?(?:\|[^\]]+)?\]\]")

# Inline relations (variant B, the form currently in use):
#   rel: <relation> [[Target A]] [[Target B]] ...
# Matched case-insensitively; "rel" is the relation name, "body" is the
# remainder of the line.
RE_INLINE_REL_LINE = re.compile(r"(?i)\brel\s*:\s*(?P<rel>[a-z_][a-z0-9_]+)\s+(?P<body>.+)$")

# Callout:
#   > [!edge] <relation>: [[A]] [[B]]
# Up to three leading whitespace characters are allowed before the ">".
RE_CALLOUT_HEADER = re.compile(
    r"^\s{0,3}>\s*\[\!edge\]\s*(?P<rel>[a-z_][a-z0-9_]+)\s*:\s*(?P<body>.*)$",
    re.IGNORECASE,
)
# ----------------------------------------------------------------------
# Utilities
# ----------------------------------------------------------------------
def _neighbors_chain(chunk_ids: List[str]) -> Iterable[Tuple[str, str]]:
"""Erzeugt (prev, next) Paare entlang der Chunk-Sequenz."""
for i in range(len(chunk_ids) - 1):
yield chunk_ids[i], chunk_ids[i + 1]
def _mk_edge_payload(
*,
kind: str,
scope: str,
note_id: str,
chunk_id: str | None = None,
source_id: str,
target_id: str,
rule_id: str,
confidence: float,
) -> Dict:
"""
Einheitliches Edge-Payload-Format.
"""
pl = {
"kind": kind, # z.B. references, depends_on, related_to, similar_to
"scope": scope, # "chunk" oder "note"
"note_id": note_id, # Note-Kontext (Quelle)
"source_id": source_id, # id der Quelle (Chunk-ID oder Note-ID)
"target_id": target_id, # Ziel (Note-ID oder Titel, falls Auflösung extern erfolgt)
"rule_id": rule_id,
"confidence": confidence,
}
if chunk_id:
pl["chunk_id"] = chunk_id
return pl
def _extract_wikilinks(text: str) -> List[str]:
    """Return the title of every wikilink in *text*, in order of appearance.

    A falsy *text* is treated as the empty string. Each title is the first
    capture group of ``RE_WIKILINK`` with surrounding whitespace stripped.
    """
    haystack = text or ""
    # The pattern has exactly one capturing group, so findall yields the
    # captured title strings directly.
    return [title.strip() for title in RE_WIKILINK.findall(haystack)]
def _extract_inline_relations_lines(text: str) -> List[Tuple[str, List[str]]]:
    """Collect inline relations written as ``rel: <relation> [[A]] [[B]]``.

    Each line of *text* is searched with ``RE_INLINE_REL_LINE``. For a
    matching line the lower-cased relation name is paired with the wikilink
    titles found in the rest of the line. When that rest contains no
    wikilink, the stripped remainder is used as a single target.

    Returns:
        A list of ``(relation, [targets...])`` tuples, one per matching line
        that produced at least one target. Empty for falsy *text*.
    """
    relations: List[Tuple[str, List[str]]] = []
    for raw_line in (text or "").splitlines():
        hit = RE_INLINE_REL_LINE.search(raw_line)
        if hit is None:
            continue
        payload = hit.group("body")
        # Prefer explicit [[...]] targets; otherwise fall back to the
        # non-empty stripped remainder of the line as one target.
        link_targets = _extract_wikilinks(payload) or [
            bare for bare in (payload.strip(),) if bare
        ]
        if link_targets:
            relation = hit.group("rel").strip().lower()
            relations.append((relation, link_targets))
    return relations
def _extract_callout_edges(text: str) -> List[Tuple[str, List[str]]]:
    """Collect callout edges written as ``> [!edge] <relation>: [[A]] [[B]]``.

    Every line of *text* is matched from its start against
    ``RE_CALLOUT_HEADER``; each matching line contributes one relation with
    one or more targets. If the part after the colon holds no wikilink, its
    stripped text (when non-empty) becomes the only target.

    Returns:
        A list of ``(relation, [targets...])`` tuples with the relation name
        lower-cased. Empty for falsy *text*.
    """
    collected: List[Tuple[str, List[str]]] = []
    for raw_line in (text or "").splitlines():
        header = RE_CALLOUT_HEADER.match(raw_line)
        if header:
            remainder = header.group("body")
            names = _extract_wikilinks(remainder)
            if not names:
                # No [[...]] present: treat the remaining text as one target.
                plain = remainder.strip()
                names = [plain] if plain else []
            if names:
                collected.append((header.group("rel").strip().lower(), names))
    return collected
# ----------------------------------------------------------------------
# Haupt-API
# ----------------------------------------------------------------------
def _collect_explicit_edges(
    text: str,
    *,
    scope: str,
    note_id: str,
    source_id: str,
    chunk_id: str | None = None,
) -> Tuple[List[Dict], Set[str]]:
    """Build wikilink, inline-relation and callout edges found in *text*.

    Returns:
        ``(edges, targets)``: the edge payloads (wikilinks first, then inline
        relations, then callouts) and the set of all target strings seen.
    """
    # Each hit is (kind, target, rule_id, confidence).
    hits: List[Tuple[str, str, str, float]] = [
        ("references", tgt, "explicit:wikilink", 1.0)
        for tgt in _extract_wikilinks(text)
    ]
    for rel, targets in _extract_inline_relations_lines(text):
        hits.extend((rel, tgt, "inline:rel", 0.95) for tgt in targets)
    for rel, targets in _extract_callout_edges(text):
        hits.extend((rel, tgt, "callout:edge", 0.9) for tgt in targets)

    edges = [
        _mk_edge_payload(
            kind=kind,
            scope=scope,
            note_id=note_id,
            chunk_id=chunk_id,
            source_id=source_id,
            target_id=tgt,
            rule_id=rule_id,
            confidence=confidence,
        )
        for kind, tgt, rule_id, confidence in hits
    ]
    return edges, {tgt for _, tgt, _, _ in hits}


def derive_edges(
    note: Dict,
    chunks: List[Dict],
    types_cfg: Dict | None = None,
) -> List[Dict]:
    """Derive all edges for one note.

    Produces, in this order:
      1. Structural edges per chunk: belongs_to, next, prev.
      2. Explicit edges from chunk text: wikilinks (references), inline
         relations and callout edges. A chunk without ``chunk_id`` falls
         back to note scope with the note id as source. If there are no
         chunks at all, the note text is scanned once at note scope.
      3. Type-based default edges (``edge_defaults``), only when at least
         one explicit target exists.
      4. De-duplication on (kind, scope, source_id, target_id, rule_id),
         keeping the first occurrence.

    Args:
        note: Mapping with "note_id" (or "id"), "type" and "text".
        chunks: Sequence of mappings with "chunk_id" and "text"; ``None`` is
            treated as an empty list.
        types_cfg: Optional config where
            ``types_cfg["types"][<type>]["edge_defaults"]`` lists relation
            names (a single string is accepted as a one-element list).

    Returns:
        The de-duplicated list of edge payload dicts.
    """
    chunks = chunks or []
    edges: List[Dict] = []
    note_id = note.get("note_id") or note.get("id")
    note_type = (note.get("type") or "").strip().lower()
    note_text = note.get("text") or ""

    # ------------------------------------------------------------------
    # 1) Structural edges per chunk: belongs_to / next / prev
    # ------------------------------------------------------------------
    chunk_ids = [c.get("chunk_id") for c in chunks if c.get("chunk_id")]
    for cid in chunk_ids:
        edges.append(
            _mk_edge_payload(
                kind="belongs_to",
                scope="chunk",
                note_id=note_id,
                chunk_id=cid,
                source_id=cid,
                target_id=note_id,
                rule_id="structure:belongs_to",
                confidence=1.0,
            )
        )
    for prev_id, next_id in _neighbors_chain(chunk_ids):
        edges.append(
            _mk_edge_payload(
                kind="next",
                scope="chunk",
                note_id=note_id,
                chunk_id=prev_id,
                source_id=prev_id,
                target_id=next_id,
                rule_id="structure:next",
                confidence=1.0,
            )
        )
        edges.append(
            _mk_edge_payload(
                kind="prev",
                scope="chunk",
                note_id=note_id,
                chunk_id=next_id,
                source_id=next_id,
                target_id=prev_id,
                rule_id="structure:prev",
                confidence=1.0,
            )
        )

    # ------------------------------------------------------------------
    # 2) Explicit references (wikilinks) + inline relations + callouts
    # ------------------------------------------------------------------
    # All explicit targets, kept as anchors for the edge_defaults step.
    explicit_targets: Set[str] = set()

    for chunk in chunks:
        cid = chunk.get("chunk_id")
        chunk_text = chunk.get("text") or ""
        if cid:
            found, targets = _collect_explicit_edges(
                chunk_text,
                scope="chunk",
                note_id=note_id,
                source_id=cid,
                chunk_id=cid,
            )
        else:
            # No chunk id: emit note-scope edges instead of chunk-scope
            # edges whose source_id would be None.
            found, targets = _collect_explicit_edges(
                chunk_text,
                scope="note",
                note_id=note_id,
                source_id=note_id,
            )
        edges.extend(found)
        explicit_targets |= targets

    # Fallback: with no chunks at all, scan the note text once (note scope).
    if not chunks and note_text:
        found, targets = _collect_explicit_edges(
            note_text,
            scope="note",
            note_id=note_id,
            source_id=note_id,
        )
        edges.extend(found)
        explicit_targets |= targets

    # ------------------------------------------------------------------
    # 3) Type-based default edges (edge_defaults)
    #    - only if explicit targets exist (otherwise nothing to anchor on)
    # ------------------------------------------------------------------
    if types_cfg and explicit_targets:
        type_entry = (types_cfg.get("types") or {}).get(note_type) or {}
        raw_defaults = type_entry.get("edge_defaults") or []
        if isinstance(raw_defaults, str):
            # A scalar would otherwise be iterated character by character.
            raw_defaults = [raw_defaults]
        defaults = [str(d).strip().lower() for d in raw_defaults if str(d).strip()]
        # Default edges are note-scope; targets are sorted for stable output.
        for rel in defaults:
            rule = f"edge_defaults:{note_type}:{rel}"
            for tgt in sorted(explicit_targets):
                edges.append(
                    _mk_edge_payload(
                        kind=rel,
                        scope="note",
                        note_id=note_id,
                        source_id=note_id,
                        target_id=tgt,
                        rule_id=rule,
                        confidence=0.7,
                    )
                )

    # ------------------------------------------------------------------
    # 4) De-duplication: key (kind, scope, source_id, target_id, rule_id)
    # ------------------------------------------------------------------
    seen: Set[Tuple[str, str, str, str, str]] = set()
    uniq: List[Dict] = []
    for edge in edges:
        key = (
            edge["kind"],
            edge["scope"],
            edge["source_id"],
            edge["target_id"],
            edge["rule_id"],
        )
        if key in seen:
            continue
        seen.add(key)
        uniq.append(edge)
    return uniq