mindnet/app/core/chunk_payload.py
Lars 3c67fd5f9b
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
app/core/chunk_payload.py aktualisiert
2025-09-30 12:26:33 +02:00

194 lines
6.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Modul: app/core/chunk_payload.py
Version: 2.0.1
Datum: 2025-09-30
Zweck
-----
Erzeugt Chunk-Payloads für Qdrant. Unterstützt abwärtskompatibel bisherige Felder und
ergänzt Felder für **verlustfreie Rekonstruktion** bei überlappenden Fenstern:
- text : effektiver, nicht-überlappender Segmenttext (für Rekonstruktion)
- window : Fenstertext inkl. Overlap (für Embeddings)
- start, end : absolute Offsets (0-basiert) des effektiven Segments im Gesamtkorpus
- overlap_left : Anzahl überlappender Zeichen zum **vorigen** Fenster
- overlap_right : Anzahl überlappender Zeichen zum **nächsten** Fenster
Abwärtskompatible Aliasse:
- id : == chunk_id (wird u. a. von build_edges_for_note erwartet)
- content/raw : bleiben leer; Primärfelder sind window/text
Typische Nutzung
----------------
from app.core.chunk_payload import make_chunk_payloads
payloads = make_chunk_payloads(frontmatter, rel_path, chunks, note_text=full_body)
`chunks` ist eine Sequenz von Objekten oder Dicts, die mindestens ein Fenster enthalten:
c.text ODER c.content ODER c.raw (falls Objekt)
bzw. c["text"] ODER c["content"] ODER c["raw"] (falls Dict)
"""
from __future__ import annotations
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
# ------------------------------- Utils ------------------------------- #
def _as_text(window_candidate: Any) -> str:
"""Extrahiert Fenstertext aus beliebigem Chunk-Objekt/Dikt."""
if window_candidate is None:
return ""
# Objekt mit Attributen
if not isinstance(window_candidate, dict):
for k in ("window", "text", "content", "raw"):
v = getattr(window_candidate, k, None)
if isinstance(v, str) and v:
return v
else:
for k in ("window", "text", "content", "raw"):
v = window_candidate.get(k)
if isinstance(v, str) and v:
return v
# Fallback: string-repr
if isinstance(window_candidate, str):
return window_candidate
return ""
def _get_int(x: Any, default: int = 0) -> int:
try:
return int(x)
except Exception:
return default
# ---------------------- Overlap-Dedupe Algorithmus ------------------- #
def _dedupe_windows_to_segments(windows: List[str]) -> Tuple[List[str], List[int]]:
"""
Ermittelt nicht-überlappende Segmente zu einer geordneten Folge von Fenster-Strings.
Gibt (segments, overlaps_left) zurück, wobei:
- segments[i] = Fenster[i] ohne das vorangestellte Overlap
- overlaps_left[i] = Länge des Overlaps von Fenster[i] zum bisher rekonstruierten Text
"""
segments: List[str] = []
overlaps_left: List[int] = []
reconstructed = ""
for w in windows:
w = w or ""
max_k = min(len(w), len(reconstructed))
k = 0
# Suche von groß nach klein (einfach, ausreichend bei kurzen Fenstern)
for cand in range(max_k, -1, -1):
if reconstructed.endswith(w[:cand]):
k = cand
break
seg = w[k:]
segments.append(seg)
overlaps_left.append(k)
reconstructed += seg
return segments, overlaps_left
# ----------------------------- Public API ---------------------------- #
def make_chunk_payloads(
    frontmatter: Dict[str, Any],
    rel_path: str,
    chunks: Iterable[Union[Dict[str, Any], Any]],
    note_text: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Build one payload dict per chunk for Qdrant upserts.

    Parameters
    ----------
    frontmatter : dict
        Expected keys: id (note_id), title, type, tags (optional).
    rel_path : str
        Relative path of the note inside the vault.
    chunks : iterable
        Sequence of chunk objects/dicts carrying a window text
        (``window``/``text``/``content``/``raw``) and optionally
        ``seq``/``chunk_index``.
    note_text : str, optional
        Full note body. Currently unused — offsets below are cumulative over
        the deduplicated segments; the parameter is kept for interface
        compatibility. NOTE(review): confirm whether exact offset mapping
        against note_text should be implemented here.

    Returns
    -------
    List of payload dicts. Key fields:
        note_id, id, chunk_id, chunk_index, seq, path, text, window,
        start, end, overlap_left, overlap_right, type, title, tags
    """
    note_id = str(frontmatter.get("id") or "").strip()
    note_type = frontmatter.get("type", None)
    note_title = frontmatter.get("title", None)
    note_tags = frontmatter.get("tags", None)
    # 1) Extract window texts and sequence numbers.
    windows: List[str] = []
    seqs: List[int] = []
    for idx, c in enumerate(chunks):
        windows.append(_as_text(c))
        if isinstance(c, dict):
            s = c.get("seq", c.get("chunk_index", idx))
        else:
            s = getattr(c, "seq", getattr(c, "chunk_index", idx))
        seqs.append(_get_int(s, idx))
    # 2) Compute non-overlapping segments.
    segments, overlaps_left = _dedupe_windows_to_segments(windows)
    overlaps_right = [0] * len(segments)  # optional: refine later
    # 3) Determine offsets (with or without note_text: cumulative).
    starts: List[int] = [0] * len(segments)
    ends: List[int] = [0] * len(segments)
    pos = 0
    for i, seg in enumerate(segments):
        starts[i] = pos
        pos += len(seg)
        ends[i] = pos
    # Path normalization is loop-invariant — hoisted out of the payload loop.
    norm_path = rel_path.replace("\\", "/").lstrip("/")
    # 4) Assemble payload dicts.
    payloads: List[Dict[str, Any]] = []
    for i, (win, seg) in enumerate(zip(windows, segments)):
        chunk_id = f"{note_id}#{i+1}"
        pl: Dict[str, Any] = {
            # Identity
            "note_id": note_id,
            "chunk_id": chunk_id,
            "id": chunk_id,  # IMPORTANT: backward-compat alias (edges expect 'id')
            # Indexing
            "chunk_index": i,
            "seq": seqs[i],
            "path": norm_path,
            # Texts
            "window": win,  # for embeddings (incl. overlap)
            "text": seg,    # overlap-free share for exact reconstruction
            # Offsets & overlaps
            "start": starts[i],
            "end": ends[i],
            "overlap_left": overlaps_left[i],
            "overlap_right": overlaps_right[i],
        }
        if note_type is not None:
            pl["type"] = note_type
        if note_title is not None:
            pl["title"] = note_title
        if note_tags is not None:
            pl["tags"] = note_tags
        payloads.append(pl)
    return payloads
# __main__ (optional: Mini-Demo)
if __name__ == "__main__": # pragma: no cover
demo_fm = {"id": "demo", "title": "Demo", "type": "concept"}
demo_chunks = [
{"text": "Alpha Beta Gamma"},
{"text": "Gamma Delta"},
{"text": "Delta Epsilon Zeta"},
]
pls = make_chunk_payloads(demo_fm, "x/demo.md", demo_chunks, note_text="Alpha Beta Gamma Delta Epsilon Zeta")
from pprint import pprint
pprint(pls)
recon = "".join(p["text"] for p in pls)
print("RECON:", recon)