#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Module: app/core/derive_edges.py
Version: 1.3.0
Date: 2025-09-30

Purpose
-------
Robust edge construction for mindnet (notes/chunks):
- belongs_to (chunk -> note)
- next / prev (chunk chain)
- references (chunk scope) from Chunk.window (fallback text/content/raw)
- optional references (note scope), de-duplicated
- optional backlink (note scope) as the reverse edge

Design notes
------------
- Reference extraction deliberately reads the **window** field (not 'text') so
  that links sitting on an overlap boundary are not lost.
- IDs are generated later, deterministically from the payload, in
  app/core/qdrant_points.py. Identical payloads are therefore emitted only
  once per chunk.
- 'status' is not set here; a separate resolver may decide on 'unresolved'
  or similar.
- Wikilinks of the form [[Target|Alias]], [[Target#Heading]] and
  [[Target#Heading|Alias]] all point at "Target": 'target_id' carries only
  the note part, while 'ref_text' keeps the label exactly as written.
  Heading-only links ([[#Heading]]) name no other note and produce no edge.
  Slugging / resolving the target is still left to the resolver.

Expected chunk payload fields (per element):
  {
    "note_id": "...",
    "chunk_id": "...",    # alias "id" should be present as well (backward compatible)
    "id": "...",
    "chunk_index": int,
    "seq": int,
    "window": str,
    "text": str,
    "path": "rel/path.md",
    ...
  }

API
---
def build_edges_for_note(
    note_id: str,
    chunks: List[dict],
    note_level_references: List[str] | None = None,
    include_note_scope_refs: bool = False,
) -> List[dict]
"""
from __future__ import annotations

import re
from typing import Dict, List, Optional, Iterable, Tuple

# Captures everything between "[[" and "]]"; heading/alias parts are split off
# afterwards by _link_target so the raw label stays available for 'ref_text'.
_WIKILINK_RE = re.compile(r"\[\[([^\]]+)\]\]")


def _get(d: dict, *keys, default=None):
    """Return the first value in *d* among *keys* that is not None, else *default*."""
    for k in keys:
        value = d.get(k)
        if value is not None:
            return value
    return default


def _chunk_text_for_refs(chunk: dict) -> str:
    """Return the text to scan for links: 'window', then 'text', 'content', 'raw'."""
    # Empty strings are falsy, so an empty 'window' falls through to 'text' etc.
    return (
        _get(chunk, "window")
        or _get(chunk, "text")
        or _get(chunk, "content")
        or _get(chunk, "raw")
        or ""
    )


def _extract_wikilinks(text: str) -> List[str]:
    """Return the whitespace-trimmed, non-empty labels of all [[...]] links in *text*.

    Labels are returned in order of appearance and exactly as written (including
    any '#heading' or '|alias' part); duplicates are kept.
    """
    if not text:
        return []
    labels = (m.group(1).strip() for m in _WIKILINK_RE.finditer(text))
    return [label for label in labels if label]


def _link_target(label: str) -> str:
    """Return the note part of a wikilink label, without alias and heading.

    "Target#Heading|Alias" -> "Target". The result is empty for labels that
    carry no note part, such as "#Heading" or "|Alias".
    """
    without_alias = label.split("|", 1)[0]
    return without_alias.split("#", 1)[0].strip()


def _dedupe(seq: Iterable[str]) -> List[str]:
    """Return the items of *seq* without duplicates, keeping first-seen order."""
    return list(dict.fromkeys(seq))


def _edge(kind: str, scope: str, source_id: str, target_id: str, note_id: str, extra: Optional[dict] = None) -> dict:
    """Build one edge payload; keys in *extra* are merged in and may override."""
    pl = {
        "kind": kind,
        "scope": scope,          # "chunk" | "note"
        "source_id": source_id,
        "target_id": target_id,
        "note_id": note_id,      # owner of the edge (the note being processed)
    }
    if extra:
        pl.update(extra)
    return pl


def build_edges_for_note(
    note_id: str,
    chunks: List[dict],
    note_level_references: Optional[List[str]] = None,
    include_note_scope_refs: bool = False,
) -> List[dict]:
    """
    Build the edge payloads for one note.

    Emitted, in this order:
    - belongs_to: one per chunk (chunk -> note)
    - next / prev: between consecutive chunks
    - references (scope "chunk"): one per distinct link label found in the
      chunk's window/text; 'target_id' is the link's note part, 'ref_text'
      the label as written
    - if include_note_scope_refs: references + backlink (scope "note"), one
      pair per distinct target over all chunk findings plus
      note_level_references

    Chunks without 'chunk_id'/'id' are ignored everywhere, so the next/prev
    chain links the remaining chunks directly. ``chunks=None`` is treated as
    an empty list. Non-string entries in note_level_references are ignored.

    Returns a list of edge payloads (without 'id').
    """
    edges: List[dict] = []

    # A chunk without an id cannot be the endpoint of any edge; filter once so
    # that all sections below agree on the same set of chunks.
    identified = []
    for ch in chunks or []:
        cid = _get(ch, "chunk_id", "id")
        if cid:
            identified.append((cid, ch))

    # --- structural edges ---
    # belongs_to
    for cid, _ in identified:
        edges.append(_edge("belongs_to", "chunk", cid, note_id, note_id, {"chunk_id": cid}))

    # next/prev
    for (a_id, _), (b_id, _) in zip(identified, identified[1:]):
        edges.append(_edge("next", "chunk", a_id, b_id, note_id, {"chunk_id": a_id}))
        edges.append(_edge("prev", "chunk", b_id, a_id, note_id, {"chunk_id": b_id}))

    # --- reference edges (chunk scope) ---
    found_targets: List[str] = []
    for cid, ch in identified:
        labels = _extract_wikilinks(_chunk_text_for_refs(ch))
        # The same label twice in one chunk would yield two identical payloads.
        for raw in _dedupe(labels):
            target = _link_target(raw)
            if not target:
                # heading-only / alias-only link: names no other note
                continue
            edges.append(
                _edge("references", "chunk", cid, target, note_id, {"chunk_id": cid, "ref_text": raw})
            )
            found_targets.append(target)

    # --- note scope (optional) ---
    if include_note_scope_refs:
        # Inputs: all chunk findings plus any note-level refs from the payload,
        # normalised the same way so that both sources de-duplicate together.
        extra_targets = [
            _link_target(r) for r in (note_level_references or []) if isinstance(r, str)
        ]
        for target in _dedupe(t for t in found_targets + extra_targets if t):
            # forward
            edges.append(_edge("references", "note", note_id, target, note_id))
            # backlink (reverse)
            edges.append(_edge("backlink", "note", target, note_id, note_id))

    return edges