app/core/derive_edges.py aktualisiert
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s

This commit is contained in:
Lars 2025-11-17 10:57:30 +01:00
parent 300086fc83
commit 4228b8a74f

View File

@ -1,120 +1,210 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
app/core/derive_edges.py — Mindnet V2 edge derivation (real + defaults), idempotent.

Creates edges for a note from:
  1) sequence edges per chunk: belongs_to, next, prev
  2) real references from chunk text (Markdown links, wikilinks) plus optional
     frontmatter refs
  3) derived edges per type rule (types.yaml edge_defaults), e.g. additional
     relations such as "depends_on" or "related_to"
     - rule tagging via rule_id="edge_defaults:<type>:<relation>"
     - de-duplication via key (source_id, target_id, relation, rule_id)

Minimum edge payload:
  - relation (alias: kind)
  - note_id   (source: the id of the note the chunks belong to)
  - source_id (chunk id or note id, depending on scope)
  - target_id (note/slug/URL id; deterministically normalized)
  - chunk_id  (when scope='chunk')
  - scope: 'chunk' | 'note'
  - confidence: float (e.g. 0.7 for derived edges)
  - rule_id: str | None
"""
from __future__ import annotations from __future__ import annotations
from typing import Any, Dict, Iterable, List, Optional, Tuple
import os, re, yaml, hashlib
# ---------------- Registry loading ----------------

def _env(n: str, d: Optional[str] = None) -> str:
    """Read environment variable *n*; fall back to *d* (or "") when unset."""
    value = os.getenv(n)
    if value is None:
        return d or ""
    return value


def _load_types() -> dict:
    """Load the type registry YAML (MINDNET_TYPES_FILE); {} on any failure."""
    path = _env("MINDNET_TYPES_FILE", "./config/types.yaml")
    try:
        with open(path, "r", encoding="utf-8") as fh:
            return yaml.safe_load(fh) or {}
    except Exception:
        # best-effort: a missing or broken registry must not break edge derivation
        return {}
def _get_types_map(reg: dict) -> dict:
    """Return the 'types' mapping from a registry dict.

    Accepts either {"types": {...}} or a bare type map; anything that is not
    a dict yields {}.
    """
    if isinstance(reg, dict) and isinstance(reg.get("types"), dict):
        return reg["types"]
    return reg if isinstance(reg, dict) else {}


def _edge_defaults_for(note_type: str, reg: dict) -> List[str]:
    """Return the edge_defaults list configured for *note_type*; [] if absent.

    Non-string entries in a malformed registry list are ignored.
    """
    # _get_types_map always returns a dict, so no extra isinstance guard needed.
    type_entry = _get_types_map(reg).get(note_type) or {}
    if isinstance(type_entry, dict):
        vals = type_entry.get("edge_defaults")
        if isinstance(vals, list):
            # was isinstance(x, (str,)) — a one-element tuple; plain str is the idiom
            return [str(x) for x in vals if isinstance(x, str)]
    return []
# ---------------- Utilities ----------------

# relations that are emitted in both directions
SYM_REL = {"related_to", "similar_to"}


def _slug_id(s: str) -> str:
    """Normalize a reference target into a deterministic slug-like id."""
    slug = (s or "").strip().lower()
    slug = re.sub(r"\s+", "-", slug)
    # crude allow-list: word chars plus URL/anchor punctuation survive
    slug = re.sub(r"[^\w\-:/#\.]", "", slug)
    return slug or "ref"
def _mk_edge_id(source_id: str, relation: str, target_id: str, rule_id: Optional[str]) -> str:
    """Deterministic short edge id derived from the dedupe key (sha1-based)."""
    base = "|".join((source_id, relation, target_id, rule_id or ""))
    digest = hashlib.sha1(base.encode("utf-8")).hexdigest()
    return "e_" + digest[:16]


def _add(edge_list: List[Dict[str, Any]],
         dedupe: set,
         note_id: str,
         source_id: str,
         relation: str,
         target_id: str,
         *,
         chunk_id: Optional[str] = None,
         scope: str = "chunk",
         confidence: Optional[float] = None,
         rule_id: Optional[str] = None) -> None:
    """Append an edge payload to *edge_list* unless its key was already seen.

    The dedupe key is (source_id, target_id, relation, rule_id); optional
    fields (chunk_id, confidence, rule_id) are only present when supplied.
    """
    fingerprint = (source_id, target_id, relation, rule_id or "")
    if fingerprint in dedupe:
        return
    dedupe.add(fingerprint)

    edge: Dict[str, Any] = dict(
        edge_id=_mk_edge_id(source_id, relation, target_id, rule_id),
        note_id=note_id,
        kind=relation,  # legacy alias for 'relation'
        relation=relation,
        scope=scope,
        source_id=source_id,
        target_id=target_id,
    )
    if chunk_id:
        edge["chunk_id"] = chunk_id
    if confidence is not None:
        edge["confidence"] = float(confidence)
    if rule_id is not None:
        edge["rule_id"] = rule_id
    edge_list.append(edge)
# ---------------- Reference parsing ----------------

MD_LINK = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")          # [text](target)
WIKI_LINK = re.compile(r"\[\[([^|\]]+)(?:\|[^]]+)?\]\]")  # [[Title]] / [[Title|alias]]


def _extract_refs(text: str) -> List[Tuple[str, str]]:
    """Return (label, target) pairs found in *text*.

    Markdown links yield (link text, target); wikilinks yield (title, title),
    with any |alias part stripped. The target may be a URL, a title, etc.
    """
    if not text:
        return []
    pairs: List[Tuple[str, str]] = [
        ((m.group(1) or "").strip(), (m.group(2) or "").strip())
        for m in MD_LINK.finditer(text)
    ]
    for m in WIKI_LINK.finditer(text):
        title = (m.group(1) or "").strip()
        pairs.append((title, title))
    return pairs
# ---------------- Main API ----------------

def build_edges_for_note(*,
                         note_id: str,
                         chunk_payloads: List[Dict[str, Any]],
                         note_level_refs: Optional[List[Dict[str, Any]]] = None,
                         include_note_scope_refs: bool = False) -> List[Dict[str, Any]]:
    """Build all edges for one note.

    Produces:
      1) sequence edges per chunk: belongs_to, next, prev
      2) real references parsed from chunk text (scope='chunk')
      3) derived edges per types.yaml edge_defaults for every found reference,
         tagged rule_id="edge_defaults:<type>:<relation>", confidence 0.7;
         symmetric relations (SYM_REL) are also emitted in reverse
      4) optionally, note-scope references from *note_level_refs* (frontmatter)

    Deduplicated by (source_id, target_id, relation, rule_id) → idempotent.
    """
    # Resolve the note type once, with the "concept" fallback applied up front,
    # so the defaults lookup and the rule_id tags stay consistent (previously a
    # missing type produced rule_id "edge_defaults:None:<rel>" while defaults
    # were looked up under "concept").
    note_type = None
    if chunk_payloads:
        note_type = chunk_payloads[0].get("type")
    note_type = note_type or "concept"

    registry = _load_types()
    defaults = _edge_defaults_for(note_type, registry)

    edges: List[Dict[str, Any]] = []
    seen: set = set()

    # 1) sequence edges
    for ch in chunk_payloads:
        cid = ch.get("chunk_id") or ch.get("id")
        if not cid:
            # a chunk without an id cannot carry edges — skip rather than
            # emitting edges with source_id=None
            continue
        nid = ch.get("note_id") or note_id

        _add(edges, seen, note_id=nid, source_id=cid, relation="belongs_to",
             target_id=nid, chunk_id=cid, scope="chunk")

        # neighbors may be a single id or a list of ids
        for neighbors, rel in ((ch.get("neighbors_next"), "next"),
                               (ch.get("neighbors_prev"), "prev")):
            if not neighbors:
                continue
            for tid in (neighbors if isinstance(neighbors, list) else [neighbors]):
                _add(edges, seen, note_id=nid, source_id=cid, relation=rel,
                     target_id=tid, chunk_id=cid, scope="chunk")

    # 2) references from chunk text (+ derived default relations per ref)
    for ch in chunk_payloads:
        cid = ch.get("chunk_id") or ch.get("id")
        if not cid:
            continue
        nid = ch.get("note_id") or note_id
        text = ch.get("text") or ""
        for _label, tgt in _extract_refs(text):
            target_id = _slug_id(tgt)
            # real reference
            _add(edges, seen, note_id=nid, source_id=cid, relation="references",
                 target_id=target_id, chunk_id=cid, scope="chunk")
            # defaults amplification
            for rel in defaults:
                if rel == "references":
                    continue
                rule = f"edge_defaults:{note_type}:{rel}"
                _add(edges, seen, note_id=nid, source_id=cid, relation=rel,
                     target_id=target_id, chunk_id=cid, scope="chunk",
                     confidence=0.7, rule_id=rule)
                if rel in SYM_REL:
                    # symmetric relation: also emit the reverse direction
                    _add(edges, seen, note_id=nid, source_id=target_id, relation=rel,
                         target_id=cid, chunk_id=cid, scope="chunk",
                         confidence=0.7, rule_id=rule)

    # 3) optional note-scope refs from frontmatter (if supplied)
    if include_note_scope_refs and note_level_refs:
        for ref in note_level_refs:
            tgt = (ref or {}).get("target_id") or (ref or {}).get("target") or ""
            if not tgt:
                continue
            target_id = _slug_id(str(tgt))
            _add(edges, seen, note_id=note_id, source_id=note_id, relation="references",
                 target_id=target_id, chunk_id=None, scope="note")
            for rel in defaults:
                if rel == "references":
                    continue
                rule = f"edge_defaults:{note_type}:{rel}"
                _add(edges, seen, note_id=note_id, source_id=note_id, relation=rel,
                     target_id=target_id, chunk_id=None, scope="note",
                     confidence=0.7, rule_id=rule)

    return edges