mindnet/app/core/chunk_payload.py
Lars 5279bcae18
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
app/core/chunk_payload.py aktualisiert
2025-09-30 12:18:16 +02:00

202 lines
7.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Modul: app/core/chunk_payload.py
Version: 2.0.0
Datum: 2025-09-30
Zweck
-----
Erzeugt Chunk-Payloads für Qdrant. Unterstützt abwärtskompatibel bisherige Felder und
ergänzt neue Felder für **verlustfreie Rekonstruktion** bei überlappenden Fenstern:
- text : effektiver, nicht-überlappender Segmenttext (für Rekonstruktion)
- window : Fenstertext inkl. Overlap (für Embeddings)
- start, end : absolute Offsets (0-basiert) des effektiven Segments im Gesamtkorpus
- overlap_left : Anzahl überlappender Zeichen zum **vorigen** Fenster
- overlap_right : Anzahl überlappender Zeichen zum **nächsten** Fenster
Abwärtskompatibel bleiben:
- chunk_id (note_id#<n>), chunk_index, seq, path, note_id, type, title, tags, etc.
Aufruf (typisch aus dem Importer)
---------------------------------
from app.core.chunk_payload import make_chunk_payloads
payloads = make_chunk_payloads(frontmatter, rel_path, chunks, note_text=full_body)
Wobei `chunks` eine Folge von Objekten oder Dicts ist, die mindestens ein Fenster enthalten:
c.text ODER c.content ODER c.raw (falls als Objekt)
bzw. c["text"] ODER c["content"] ODER c["raw"] (falls Dict)
Note: the effective segmentation is always derived from an **overlap deduplication**
between consecutive windows, whether or not `note_text` is passed.
"""
from __future__ import annotations
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
import re
# ------------------------------- Utils ------------------------------- #
def _as_text(window_candidate: Any) -> str:
"""Extrahiert Fenstertext aus beliebigem Chunk-Objekt/Dikt."""
if window_candidate is None:
return ""
# Objekt mit Attributen
for k in ("text", "content", "raw", "window"):
v = getattr(window_candidate, k, None) if not isinstance(window_candidate, dict) else window_candidate.get(k)
if isinstance(v, str) and v:
return v
# Fallback: string-repr
if isinstance(window_candidate, str):
return window_candidate
return ""
def _get_int(x: Any, default: int = 0) -> int:
try:
return int(x)
except Exception:
return default
def _norm_lines(s: str) -> str:
"""Nur für defensive Gleichheitstests NICHT für Persistenz."""
return "\n".join([ln.rstrip() for ln in s.replace("\r\n", "\n").replace("\r", "\n").split("\n")]).strip()
# ---------------------- Overlap-Dedupe Algorithmus ------------------- #
def _dedupe_windows_to_segments(windows: List[str]) -> Tuple[List[str], List[int]]:
"""
Ermittelt nicht-überlappende Segmente zu einer geordneten Folge von Fenster-Strings.
Gibt (segments, overlaps_left) zurück, wobei:
- segments[i] = Fenster[i] ohne das vorangestellte Overlap
- overlaps_left[i] = Länge des Overlaps von Fenster[i] zum bisher rekonstruierten Text
"""
segments: List[str] = []
overlaps_left: List[int] = []
reconstructed = ""
for w in windows:
w = w or ""
# finde größtes k, sodass reconstructed.endswith(w[:k])
max_k = min(len(w), max(0, len(reconstructed)))
k = 0
# Suche von groß nach klein (einfache O(n^2) ausreichend bei kurzen Fenstern)
for cand in range(max_k, -1, -1):
if reconstructed.endswith(w[:cand]):
k = cand
break
seg = w[k:]
segments.append(seg)
overlaps_left.append(k)
reconstructed += seg
return segments, overlaps_left
# ----------------------------- Public API ---------------------------- #
def make_chunk_payloads(
    frontmatter: Dict[str, Any],
    rel_path: str,
    chunks: Iterable[Union[Dict[str, Any], Any]],
    note_text: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """
    Build one payload dict per chunk.

    Parameters
    ----------
    frontmatter : dict
        Read keys: ``id`` (used as note_id), ``title``, ``type``, ``tags``.
        ``title``/``type``/``tags`` are only copied when not ``None``.
    rel_path : str
        Relative path of the note; stored with forward slashes and without
        a leading ``/``.
    chunks : iterable
        Chunk objects or dicts carrying the window text (see ``_as_text``).
        ``seq`` is taken from ``seq``, then ``chunk_index``, then the
        enumeration index.
    note_text : str, optional
        Full note body. When given, each segment is located in it (searching
        forward from the end of the previous segment) so that ``start``/``end``
        are real offsets into ``note_text``. A segment that cannot be found
        falls back to the running cumulative position.

    Returns
    -------
    list of dict
        One payload per chunk with the keys ``note_id``, ``chunk_id``
        (``<note_id>#<n>``, 1-based), ``chunk_index`` (0-based), ``seq``,
        ``path``, ``window`` (text incl. overlap), ``text`` (window without
        its leading overlap), ``start``, ``end``, ``overlap_left``,
        ``overlap_right`` and, when present, ``type``, ``title``, ``tags``.
        ``overlap_right`` is currently always 0.
    """
    note_id = str(frontmatter.get("id") or "").strip()
    note_type = frontmatter.get("type", None)
    note_title = frontmatter.get("title", None)
    note_tags = frontmatter.get("tags", None)
    # Same value for every chunk, so normalise the path once.
    norm_path = rel_path.replace("\\", "/").lstrip("/")

    # 1) Extract the window texts and the best available seq per chunk.
    windows: List[str] = []
    seqs: List[int] = []
    for idx, c in enumerate(chunks):
        windows.append(_as_text(c))
        if isinstance(c, dict):
            s = c.get("seq", c.get("chunk_index", idx))
        else:
            s = getattr(c, "seq", getattr(c, "chunk_index", idx))
        # Non-convertible values (e.g. None) fall back to the position.
        seqs.append(_get_int(s, idx))

    # 2) Compute the non-overlapping segments.
    segments, overlaps_left = _dedupe_windows_to_segments(windows)
    # The right overlap could be derived from the next window's left overlap;
    # it is deliberately stored as 0 for now.
    overlaps_right = [0] * len(segments)

    # 3) Offsets. Without a corpus they are the cumulative segment lengths.
    #    With a corpus each segment is searched from the current position, so
    #    text that sits between segments in `note_text` (and is missing from
    #    the windows) no longer shifts every following offset. If the segments
    #    tile `note_text` exactly from offset 0, both modes agree.
    use_corpus = isinstance(note_text, str)
    starts: List[int] = []
    ends: List[int] = []
    pos = 0
    for seg in segments:
        if use_corpus and seg:
            hit = note_text.find(seg, pos)
            if hit >= 0:
                pos = hit
        starts.append(pos)
        pos += len(seg)
        ends.append(pos)

    # 4) Assemble the payload dicts.
    payloads: List[Dict[str, Any]] = []
    for i, (win, seg) in enumerate(zip(windows, segments)):
        pl: Dict[str, Any] = {
            "note_id": note_id,
            "chunk_id": f"{note_id}#{i+1}",
            "chunk_index": i,
            "seq": seqs[i],
            "path": norm_path,
            # Texts
            "window": win,  # for embeddings (incl. overlap)
            "text": seg,  # effective part (lossless reconstruction)
            # Offsets & overlaps
            "start": starts[i],
            "end": ends[i],
            "overlap_left": overlaps_left[i],
            "overlap_right": overlaps_right[i],
        }
        if note_type is not None:
            pl["type"] = note_type
        if note_title is not None:
            pl["title"] = note_title
        if note_tags is not None:
            pl["tags"] = note_tags
        payloads.append(pl)
    return payloads
# __main__ (optional mini test): build payloads for three overlapping windows
# and print the text reconstructed from their "text" fields.
if __name__ == "__main__":  # pragma: no cover
    from pprint import pprint

    demo_fm = {"id": "demo", "title": "Demo", "type": "concept"}
    demo_chunks = [
        {"text": window}
        for window in ("Alpha Beta Gamma", "Gamma Delta", "Delta Epsilon Zeta")
    ]
    pls = make_chunk_payloads(
        demo_fm,
        "x/demo.md",
        demo_chunks,
        note_text="Alpha Beta Gamma Delta Epsilon Zeta",
    )
    pprint(pls)
    # Concatenating the de-duplicated segments should give back the full text.
    recon = "".join([p["text"] for p in pls])
    print("RECON:", recon)