mindnet/app/core/chunk_payload.py
Lars c2802e7cb3
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 2s
app/core/chunk_payload.py aktualisiert
2025-11-08 14:24:40 +01:00

136 lines
4.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
chunk_payload.py v2.2.1

Purpose:
    - Builds chunk payloads (including 'text' and 'window'), fully backward
      compatible with v2.2.0 (signatures unchanged).
    - New: optional type profiles via the type registry (chunk_profile),
      without breaking existing behaviour. Automatically falls back to the
      previous settings when no registry is present/configured.

Important:
    - The signature of make_chunk_payloads stays compatible:
          make_chunk_payloads(chunks, note_id, note_title, note_type, note_path, ...)
    - 'window' can differ from 'text' only when overlap > 0; otherwise the
      two are identical.
"""
from __future__ import annotations
from typing import Any, Dict, Iterable, List, Optional, Tuple
# Assumption: the project already provides these utilities:
#   - a Chunk type (dataclass/NamedTuple) with fields: id (or idx), text, start, end
#   - a windowing function (otherwise this module builds a simple window
#     over the neighbouring chunks itself)
try:
    from app.core.chunker import Chunk  # type: ignore
except Exception:
    # Minimal stand-in for environments where the import above fails;
    # production deployments are expected to ship the real Chunk type.
    from typing import NamedTuple

    class Chunk(NamedTuple):
        """Fallback chunk record, defined only when importing the real one fails."""

        idx: int
        text: str
        start: int
        end: int
# Optional: use the type registry when it can be imported.
try:
    from app.core.type_registry import resolve_chunk_profile
except Exception:
    # No usable registry: every note type resolves to the caller's default.

    def resolve_chunk_profile(note_type: str, default_profile: str = "default") -> str:
        """Fallback resolver: ignore ``note_type`` and return ``default_profile``."""
        return default_profile
# Overlap used for profiles that are not listed below
# (e.g. when the chunker itself provides no overlap).
DEFAULT_OVERLAP = 0

# Overlap (in characters) configured per chunk profile.
DEFAULT_PROFILE_TO_OVERLAP = {
    "short": 40,
    "medium": 80,
    "long": 120,
    "default": 0,
}


def _estimate_overlap(profile: str) -> int:
    """Return the overlap configured for ``profile``.

    Profiles missing from ``DEFAULT_PROFILE_TO_OVERLAP`` yield
    ``DEFAULT_OVERLAP``. The result is always converted to ``int``.
    """
    if profile in DEFAULT_PROFILE_TO_OVERLAP:
        return int(DEFAULT_PROFILE_TO_OVERLAP[profile])
    return int(DEFAULT_OVERLAP)
def _make_window_text(chunks: List[Chunk], i: int, overlap_chars: int) -> str:
    """Build the context window text for the chunk at index ``i``.

    The window consists of the tail of the previous chunk (at most
    ``overlap_chars`` characters), the chunk's own text, and the head of the
    next chunk (at most ``overlap_chars`` characters). Non-empty parts are
    joined with a single space and the result is stripped of surrounding
    whitespace.

    Chunk text is read leniently: a missing or ``None`` ``text`` attribute
    counts as an empty string. This matches how ``make_chunk_payloads`` reads
    the ``text`` field, so a chunk without text no longer aborts the whole
    payload build with an ``AttributeError``/``TypeError``.

    Args:
        chunks: All chunks of the note, in order.
        i: Index of the chunk the window is built for.
        overlap_chars: Number of characters to borrow from each neighbour.
            Zero or a negative value disables the window.

    Returns:
        The window text. When ``overlap_chars <= 0`` this is the chunk's own
        text, unchanged (not stripped).

    Raises:
        IndexError: If ``i`` is outside ``chunks``.
    """
    center = getattr(chunks[i], "text", "") or ""
    if overlap_chars <= 0:
        return center

    # overlap_chars is strictly positive from here on, so the negative slice
    # below can never degenerate into ``[-0:]`` (which would take everything).
    # Slicing clamps to the string length, so short neighbours are taken whole.
    left = ""
    if i > 0:
        left = (getattr(chunks[i - 1], "text", "") or "")[-overlap_chars:]

    right = ""
    if i + 1 < len(chunks):
        right = (getattr(chunks[i + 1], "text", "") or "")[:overlap_chars]

    # Skip empty parts so the join adds no leading/trailing/double separators.
    return " ".join(part for part in (left, center, right) if part).strip()
def make_chunk_payloads(
    chunks: Iterable[Chunk],
    note_id: str,
    note_title: str,
    note_type: Optional[str],
    note_path: str,
    *,
    chunk_profile: Optional[str] = None,
    window_overwrite: Optional[int] = None,
    extra_payload: Optional[Dict[str, Any]] = None,
) -> List[Dict[str, Any]]:
    """Build one payload dict per chunk of a note (backward-compatible factory).

    Args:
        chunks: Chunks produced by the chunker; consumed once into a list.
        note_id: ID of the note (e.g. from the frontmatter 'id').
        note_title: Title of the note.
        note_type: Type from the frontmatter (concept, task, ...); stored in
            the payload as given and used for the registry lookup when truthy.
        note_path: Relative path inside the vault; stored under 'path'.
        chunk_profile: Profile override (e.g. 'short'); wins over the registry.
        window_overwrite: Forces the overlap in characters. Any ``int`` is
            taken as-is, so ``0`` (or a negative value) disables the window.
        extra_payload: Additional fields merged into every payload last, so
            they can overwrite the standard keys.

    Returns:
        A list of payload dicts, each containing at least:
        "note_id", "note_title", "note_type", "path", "chunk_id",
        "text", "window", "start", "end".
    """
    all_chunks = list(chunks)

    # Profile precedence: explicit override, then the registry lookup for the
    # note type, then the literal "default".
    if chunk_profile:
        profile = chunk_profile
    elif note_type:
        profile = resolve_chunk_profile(note_type) or "default"
    else:
        profile = "default"

    # An explicit integer overlap beats the profile-derived estimate.
    if isinstance(window_overwrite, int):
        overlap = window_overwrite
    else:
        overlap = _estimate_overlap(profile)

    result: List[Dict[str, Any]] = []
    for number, chunk in enumerate(all_chunks, start=1):
        # Stable, backward-compatible chunk ID: note_id#<running number 1..n>.
        chunk_id = f"{note_id}#{number}"
        window = _make_window_text(all_chunks, number - 1, overlap)
        entry: Dict[str, Any] = dict(
            note_id=note_id,
            note_title=note_title,
            note_type=note_type,
            path=note_path,
            chunk_id=chunk_id,
            text=getattr(chunk, "text", ""),
            window=window,
            start=getattr(chunk, "start", 0),
            end=getattr(chunk, "end", 0),
        )
        if extra_payload:
            entry.update(extra_payload)
        result.append(entry)
    return result