mindnet/app/core/derive_edges.py
Lars 6ea452cc3f
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
app/core/derive_edges.py aktualisiert
2025-10-01 10:54:46 +02:00

132 lines
4.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Modul: app/core/derive_edges.py
Version: 1.4.0
Datum: 2025-10-01
Zweck
-----
Robuste Kantenbildung für mindnet (Notes/Chunks):
- belongs_to (chunk -> note)
- next / prev (chunk-Kette)
- references (chunk-scope) aus Chunk.window/text
- optional references/backlink (note-scope)
Wichtig: Wikilinks werden mit der Parser-Funktion `extract_wikilinks` extrahiert,
damit Varianten wie [[id#anchor]] oder [[id|label]] korrekt auf 'id' reduziert werden.
Erwartete Chunk-Payload-Felder:
{
"note_id": "...",
"chunk_id": "...", # Alias "id" ist zulässig
"id": "...",
"chunk_index": int,
"seq": int,
"window": str,
"text": str,
"path": "rel/path.md",
...
}
"""
from __future__ import annotations
from typing import Dict, List, Optional, Iterable
# WICHTIG: benutze die Parser-Extraktion für saubere Wikilinks
from app.core.parser import extract_wikilinks
def _get(d: dict, *keys, default=None):
    """Return the first usable value found in *d* under any of *keys*.

    Keys are tried in the order given. A key counts as a hit only when it
    is present in *d* and its value is not ``None``; falsy values such as
    ``0`` or ``""`` are returned as-is.

    Args:
        d: Mapping to look values up in.
        *keys: Candidate keys, in priority order.
        default: Value returned when no key yields a non-``None`` value.

    Returns:
        The value of the first matching key, otherwise *default*.
    """
    for key in keys:
        if key not in d:
            continue
        value = d[key]
        if value is not None:
            return value
    return default
def _chunk_text_for_refs(chunk: dict) -> str:
    """Pick the chunk field that should be scanned for references.

    Fields are tried in priority order: ``window``, then ``text``, then
    ``content``, then ``raw``. The first field holding a truthy value wins.

    Args:
        chunk: Chunk payload.

    Returns:
        The first truthy field value, or ``""`` when none of the fields is
        set to a truthy value.
    """
    for field in ("window", "text", "content", "raw"):
        candidate = _get(chunk, field)
        if candidate:
            return candidate
    return ""
def _dedupe(seq: Iterable[str]) -> List[str]:
    """Drop repeated items from *seq*, keeping the first occurrence of each.

    Args:
        seq: Items to deduplicate; they must be hashable.

    Returns:
        A new list containing each distinct item once, in the order in which
        it first appeared in *seq*.
    """
    # dict keys are unique and keep insertion order, so the keys of a dict
    # built from the sequence are exactly the first occurrences, in order.
    return list(dict.fromkeys(seq))
def _edge(
    kind: str,
    scope: str,
    source_id: str,
    target_id: str,
    note_id: str,
    extra: Optional[dict] = None,
) -> dict:
    """Assemble the payload dict for a single edge.

    Args:
        kind: Edge kind stored under ``"kind"``.
        scope: Edge scope stored under ``"scope"`` ("chunk" or "note").
        source_id: Id of the edge's source.
        target_id: Id of the edge's target.
        note_id: Id of the note that carries the edge (the current note).
        extra: Optional additional fields merged into the payload; on a key
            clash the value from *extra* replaces the base field.

    Returns:
        A new dict with the five base fields plus any fields from *extra*.
    """
    payload = dict(
        kind=kind,
        scope=scope,
        source_id=source_id,
        target_id=target_id,
        note_id=note_id,
    )
    if not extra:
        return payload
    payload.update(extra)
    return payload
def build_edges_for_note(
    note_id: str,
    chunks: List[dict],
    note_level_references: Optional[List[str]] = None,
    include_note_scope_refs: bool = False,
) -> List[dict]:
    """Build all edge payloads for a single note.

    Edges are appended in this order:

    1. ``belongs_to`` (chunk -> note) for every chunk that has an id.
    2. ``next`` / ``prev`` between chunks that are neighbours in *chunks*
       (list order); a pair is skipped when either chunk has no id.
    3. ``references`` at chunk scope: one edge per distinct target that
       ``extract_wikilinks`` returns for the chunk's text (as selected by
       ``_chunk_text_for_refs``).
    4. Only when *include_note_scope_refs* is true: ``references``
       (note -> target) and ``backlink`` (target -> note) at note scope,
       for the deduplicated union of all chunk-level targets and
       *note_level_references*.

    Args:
        note_id: Id of the note the chunks belong to; stored on every edge.
        chunks: Chunk payloads. The chunk id is read from ``"chunk_id"``,
            falling back to ``"id"``. Chunks without an id produce no edges.
        note_level_references: Optional extra reference targets for the
            note-scope edges; entries that are not non-empty strings are
            ignored.
        include_note_scope_refs: Whether to emit the note-scope
            ``references`` / ``backlink`` edges.

    Returns:
        The list of edge payload dicts, in the order described above.
    """
    # Resolve every chunk id once instead of once per pass.
    identified = [(_get(ch, "chunk_id", "id"), ch) for ch in chunks]

    edges: List[dict] = []

    # belongs_to: chunk -> note
    for cid, _chunk in identified:
        if cid:
            edges.append(
                _edge("belongs_to", "chunk", cid, note_id, note_id, {"chunk_id": cid})
            )

    # next / prev: link each chunk with its successor in list order.
    for (a_id, _a), (b_id, _b) in zip(identified, identified[1:]):
        if not a_id or not b_id:
            continue
        edges.append(_edge("next", "chunk", a_id, b_id, note_id, {"chunk_id": a_id}))
        edges.append(_edge("prev", "chunk", b_id, a_id, note_id, {"chunk_id": b_id}))

    # references (chunk scope)
    refs_all: List[str] = []
    for cid, ch in identified:
        if not cid:
            continue
        # A chunk may link the same target several times; dedupe so that no
        # two identical edge payloads are emitted for one chunk.
        refs = _dedupe(extract_wikilinks(_chunk_text_for_refs(ch)))
        for ref in refs:
            edges.append(
                _edge(
                    "references",
                    "chunk",
                    cid,
                    ref,
                    note_id,
                    {"chunk_id": cid, "ref_text": ref},
                )
            )
        refs_all.extend(refs)

    # optional: references / backlinks at note scope
    if include_note_scope_refs:
        refs_note = list(refs_all)
        if note_level_references:
            refs_note.extend(
                r for r in note_level_references if isinstance(r, str) and r
            )
        # The same target may occur in several chunks and in the note-level
        # list; emit each note-scope pair only once.
        for ref in _dedupe(refs_note):
            edges.append(_edge("references", "note", note_id, ref, note_id))
            edges.append(_edge("backlink", "note", ref, note_id, note_id))

    return edges