All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
365 lines
12 KiB
Python
365 lines
12 KiB
Python
# app/core/derive_edges.py
|
||
# -*- coding: utf-8 -*-
|
||
"""
|
||
Edge-Ableitung (V2)
|
||
Beibehaltung der bestehenden Funktionalität + Erweiterung:
|
||
- Mehrere Inline-Referenzen in einer Zeile: rel: <relation> [[A]] [[B]] ...
|
||
Kompatibel mit:
|
||
- Strukturkanten: belongs_to / next / prev
|
||
- Explizite Wikilinks -> references
|
||
- Inline-Relationen -> inline:rel
|
||
- Callout-Kanten -> callout:edge
|
||
- Typbasierte Default-Kanten (edge_defaults aus types.yaml)
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
import re
|
||
from typing import Dict, List, Iterable, Tuple, Set
|
||
|
||
# ----------------------------------------------------------------------
|
||
# Regex-Bausteine
|
||
# ----------------------------------------------------------------------
|
||
|
||
# Wikilinks: [[Title]] or [[Title|Alias]].
# Capture group 1 is the title; an optional "#..." part and an optional
# "|alias" part are matched but not captured.
RE_WIKILINK = re.compile(r"\[\[([^\]|#]+)(?:#[^\]|]+)?(?:\|[^\]]+)?\]\]")

# Inline relations (variant B, the form you are currently using):
#   rel: <relation> [[Target A]] [[Target B]] ...
# Case-insensitive. "rel" is the relation name (as written, at least two
# characters); "body" is the rest of the line after the relation name.
RE_INLINE_REL_LINE = re.compile(
    r"(?i)\brel\s*:\s*(?P<rel>[a-z_][a-z0-9_]+)\s+(?P<body>.+)$"
)

# Callout:
#   > [!edge] <relation>: [[A]] [[B]]
# Case-insensitive; allows up to three leading whitespace characters before
# the ">". "rel" is the relation name, "body" is everything after the colon
# (may be empty).
RE_CALLOUT_HEADER = re.compile(r"^\s{0,3}>\s*\[\!edge\]\s*(?P<rel>[a-z_][a-z0-9_]+)\s*:\s*(?P<body>.*)$", re.IGNORECASE)
|
||
|
||
# ----------------------------------------------------------------------
|
||
# Utilities
|
||
# ----------------------------------------------------------------------
|
||
|
||
def _neighbors_chain(chunk_ids: List[str]) -> Iterable[Tuple[str, str]]:
|
||
"""Erzeugt (prev, next) Paare entlang der Chunk-Sequenz."""
|
||
for i in range(len(chunk_ids) - 1):
|
||
yield chunk_ids[i], chunk_ids[i + 1]
|
||
|
||
def _mk_edge_payload(
|
||
*,
|
||
kind: str,
|
||
scope: str,
|
||
note_id: str,
|
||
chunk_id: str | None = None,
|
||
source_id: str,
|
||
target_id: str,
|
||
rule_id: str,
|
||
confidence: float,
|
||
) -> Dict:
|
||
"""
|
||
Einheitliches Edge-Payload-Format.
|
||
"""
|
||
pl = {
|
||
"kind": kind, # z.B. references, depends_on, related_to, similar_to
|
||
"scope": scope, # "chunk" oder "note"
|
||
"note_id": note_id, # Note-Kontext (Quelle)
|
||
"source_id": source_id, # id der Quelle (Chunk-ID oder Note-ID)
|
||
"target_id": target_id, # Ziel (Note-ID oder Titel, falls Auflösung extern erfolgt)
|
||
"rule_id": rule_id,
|
||
"confidence": confidence,
|
||
}
|
||
if chunk_id:
|
||
pl["chunk_id"] = chunk_id
|
||
return pl
|
||
|
||
def _extract_wikilinks(text: str) -> List[str]:
    """Return the titles of all wikilinks in *text*, in order of appearance.

    Args:
        text: Text to scan; ``None`` or an empty string yields an empty list.

    Returns:
        The whitespace-stripped title (capture group 1 of ``RE_WIKILINK``) of
        every match. Duplicates are kept. Links whose title is empty after
        stripping are skipped.
    """
    titles = (m.group(1).strip() for m in RE_WIKILINK.finditer(text or ""))
    # "[[ ]]" or "[[ |alias]]" match the pattern but carry no usable title;
    # returning "" would lead to edges with an empty target.
    return [title for title in titles if title]
|
||
|
||
def _extract_inline_relations_lines(text: str) -> List[Tuple[str, List[str]]]:
    """Collect inline relations written as ``rel: <relation> [[A]] [[B]]``.

    Each line of *text* is searched with ``RE_INLINE_REL_LINE``.

    Returns:
        A list of ``(relation, [targets...])`` tuples, one per matching line
        that has at least one target. The relation is lower-cased.
    """
    if not text:
        return []

    found: List[Tuple[str, List[str]]] = []
    for raw_line in text.splitlines():
        hit = RE_INLINE_REL_LINE.search(raw_line)
        if hit is None:
            continue
        relation = hit.group("rel").strip().lower()
        remainder = hit.group("body")
        # Prefer the [[...]] targets found in the remainder of the line.
        link_targets = _extract_wikilinks(remainder)
        if not link_targets:
            # No [[...]] present: treat the remaining text as a single target.
            bare = remainder.strip()
            link_targets = [bare] if bare else []
        if link_targets:
            found.append((relation, link_targets))
    return found
|
||
|
||
def _extract_callout_edges(text: str) -> List[Tuple[str, List[str]]]:
    """Collect callout edges written as ``> [!edge] <relation>: [[A]] [[B]]``.

    Each line of *text* is matched from its start with ``RE_CALLOUT_HEADER``;
    every matching line contributes one relation with 1..n targets.

    Returns:
        A list of ``(relation, [targets...])`` tuples. The relation is
        lower-cased.
    """
    if not text:
        return []

    collected: List[Tuple[str, List[str]]] = []
    for raw_line in text.splitlines():
        hit = RE_CALLOUT_HEADER.match(raw_line)
        if hit is None:
            continue
        relation = hit.group("rel").strip().lower()
        remainder = hit.group("body")
        link_targets = _extract_wikilinks(remainder)
        if not link_targets:
            # Robustness: without [[...]] the rest of the body is one target.
            bare = remainder.strip()
            link_targets = [bare] if bare else []
        if link_targets:
            collected.append((relation, link_targets))
    return collected
|
||
|
||
# ----------------------------------------------------------------------
|
||
# Haupt-API
|
||
# ----------------------------------------------------------------------
|
||
|
||
def _explicit_edges_from_text(
    text: str,
    *,
    note_id: str,
    scope: str,
    source_id: str,
    chunk_id: str | None = None,
) -> List[Dict]:
    """Derive wikilink, inline-relation and callout edges from one text."""
    # (kind, target, rule_id, confidence) in emission order:
    # wikilinks first, then inline relations, then callouts.
    found: List[Tuple[str, str, str, float]] = [
        ("references", target, "explicit:wikilink", 1.0)
        for target in _extract_wikilinks(text)
    ]
    for rule_id, confidence, extract in (
        ("inline:rel", 0.95, _extract_inline_relations_lines),
        ("callout:edge", 0.9, _extract_callout_edges),
    ):
        for relation, targets in extract(text):
            found.extend(
                (relation, target, rule_id, confidence) for target in targets
            )
    return [
        _mk_edge_payload(
            kind=kind,
            scope=scope,
            note_id=note_id,
            chunk_id=chunk_id,
            source_id=source_id,
            target_id=target,
            rule_id=rule_id,
            confidence=confidence,
        )
        for kind, target, rule_id, confidence in found
    ]


def _dedupe_edges(edges: List[Dict]) -> List[Dict]:
    """Drop repeated edges, keeping the first occurrence and the input order."""
    seen: Set[Tuple[str, str, str, str, str]] = set()
    unique: List[Dict] = []
    for edge in edges:
        key = (
            edge["kind"],
            edge["scope"],
            edge["source_id"],
            edge["target_id"],
            edge["rule_id"],
        )
        if key not in seen:
            seen.add(key)
            unique.append(edge)
    return unique


def derive_edges(
    note: Dict,
    chunks: List[Dict],
    types_cfg: Dict | None = None,
) -> List[Dict]:
    """Derive the edges for one note.

    Expected fields:
        note: {
            "note_id": str,   # "id" is used when "note_id" is missing/empty
            "title": str,
            "type": str,
            "text": str
        }
        chunks: [{
            "chunk_id": str,
            "index": int,
            "text": str,
            ...
        }, ...]

        types_cfg (loaded from types.yaml), optional, with:
            types_cfg["types"][<type>]["edge_defaults"] = [relation, ...]

    Produced edges, in this order:
        1. Structure edges per chunk: belongs_to, then next / prev between
           adjacent chunks (confidence 1.0).
        2. Explicit edges per chunk text: wikilinks -> "references" (1.0),
           inline relations (0.95), callout edges (0.9). A chunk without a
           chunk_id contributes note-scope edges sourced at the note. If
           there are no chunks at all, the note text is scanned once instead.
        3. Type-based default edges (0.7) from the note to every explicit
           target, only when types_cfg is given and explicit targets exist.

    Returns:
        The edge payloads, de-duplicated on
        (kind, scope, source_id, target_id, rule_id).
    """
    chunks = chunks or []  # tolerate None
    note_id = note.get("note_id") or note.get("id")
    note_type = (note.get("type") or "").strip().lower()
    note_text = note.get("text") or ""

    edges: List[Dict] = []

    # ------------------------------------------------------------------
    # 1) Structure edges per chunk: belongs_to / next / prev
    # ------------------------------------------------------------------
    chunk_ids = [c.get("chunk_id") for c in chunks if c.get("chunk_id")]
    for cid in chunk_ids:
        edges.append(
            _mk_edge_payload(
                kind="belongs_to",
                scope="chunk",
                note_id=note_id,
                chunk_id=cid,
                source_id=cid,
                target_id=note_id,
                rule_id="structure:belongs_to",
                confidence=1.0,
            )
        )
    for prev_id, next_id in _neighbors_chain(chunk_ids):
        edges.append(
            _mk_edge_payload(
                kind="next",
                scope="chunk",
                note_id=note_id,
                chunk_id=prev_id,
                source_id=prev_id,
                target_id=next_id,
                rule_id="structure:next",
                confidence=1.0,
            )
        )
        edges.append(
            _mk_edge_payload(
                kind="prev",
                scope="chunk",
                note_id=note_id,
                chunk_id=next_id,
                source_id=next_id,
                target_id=prev_id,
                rule_id="structure:prev",
                confidence=1.0,
            )
        )

    # ------------------------------------------------------------------
    # 2) Explicit references (wikilinks) + inline relations + callouts
    #    - chunk scope with source = chunk_id when the chunk has one,
    #      otherwise note scope as fallback.
    # ------------------------------------------------------------------
    explicit_edges: List[Dict] = []
    for chunk in chunks:
        cid = chunk.get("chunk_id")
        chunk_text = chunk.get("text") or ""
        if cid:
            explicit_edges.extend(
                _explicit_edges_from_text(
                    chunk_text,
                    note_id=note_id,
                    scope="chunk",
                    source_id=cid,
                    chunk_id=cid,
                )
            )
        else:
            # A chunk without an id cannot be an edge source; attribute its
            # links to the note instead of emitting source_id=None.
            explicit_edges.extend(
                _explicit_edges_from_text(
                    chunk_text,
                    note_id=note_id,
                    scope="note",
                    source_id=note_id,
                )
            )

    # Fallback: if the note has no chunks at all, scan the note text once
    # (yields note-scope edges).
    if not chunks and note_text:
        explicit_edges.extend(
            _explicit_edges_from_text(
                note_text,
                note_id=note_id,
                scope="note",
                source_id=note_id,
            )
        )

    edges.extend(explicit_edges)
    # All explicit targets; they anchor the type-based defaults below.
    explicit_targets: Set[str] = {e["target_id"] for e in explicit_edges}

    # ------------------------------------------------------------------
    # 3) Type-based default edges (edge_defaults)
    #    - only when explicit targets exist (otherwise nothing to anchor on)
    # ------------------------------------------------------------------
    if types_cfg and explicit_targets:
        type_entry = (types_cfg.get("types") or {}).get(note_type) or {}
        raw_defaults = type_entry.get("edge_defaults") or []
        if isinstance(raw_defaults, str):
            # A scalar in the config means one relation, not a list of chars.
            raw_defaults = [raw_defaults]
        defaults = [str(d).strip().lower() for d in raw_defaults if str(d).strip()]
        # Default edges are note-scope: they start from the note context.
        for rel in defaults:
            rule = f"edge_defaults:{note_type}:{rel}"
            for tgt in sorted(explicit_targets):
                edges.append(
                    _mk_edge_payload(
                        kind=rel,
                        scope="note",
                        note_id=note_id,
                        source_id=note_id,
                        target_id=tgt,
                        rule_id=rule,
                        confidence=0.7,
                    )
                )

    # ------------------------------------------------------------------
    # 4) De-duplication: key (kind, scope, source_id, target_id, rule_id)
    # ------------------------------------------------------------------
    return _dedupe_edges(edges)
|