All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 2s
136 lines
4.7 KiB
Python
136 lines
4.7 KiB
Python
"""
|
||
chunk_payload.py v2.2.1
|
||
|
||
Zweck:
|
||
- Erzeugt Chunk-Payloads (inkl. 'text' und 'window'), vollständig
|
||
rückwärtskompatibel zu v2.2.0 (Signaturen beibehalten).
|
||
- Neu: optionale Typ-Profile via Type-Registry (chunk_profile), ohne
|
||
bestehendes Verhalten zu brechen. Fällt automatisch auf die bisherigen
|
||
Einstellungen zurück, wenn keine Registry vorhanden/konfiguriert ist.
|
||
|
||
Wichtig:
|
||
- Signatur von make_chunk_payloads bleibt kompatibel:
|
||
make_chunk_payloads(chunks, note_id, note_title, note_type, note_path, ...)
|
||
- 'window' != 'text', wenn Overlap > 0; ansonsten identisch.
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
from typing import Any, Dict, Iterable, List, Optional, Tuple
|
||
|
||
# Assumption: these utilities already exist in the project (from a working state):
# - a Chunk type (dataclass/NamedTuple) with fields: id (or idx), text, start, end
# - a windowing function (otherwise a simple window over neighbouring chunks is built here)
|
||
try:
    from app.core.chunker import Chunk  # type: ignore
except Exception:
    # Deliberately broad: any failure while importing the project chunker
    # selects the minimal fallback below. Production deployments are expected
    # to provide the real Chunk type.
    from typing import NamedTuple

    class Chunk(NamedTuple):
        """Minimal stand-in for the chunker's Chunk type."""

        idx: int
        text: str
        # start/end are copied verbatim into the payload's "start"/"end"
        # fields; presumably offsets into the note text -- confirm against
        # the real chunker.
        start: int
        end: int
|
||
|
||
# Optional: hook in the type registry.
try:
    from app.core.type_registry import resolve_chunk_profile
except Exception:
    # Deliberately broad: if the registry cannot be imported for any reason,
    # fall back to a resolver that keeps the pre-registry behaviour.
    def resolve_chunk_profile(note_type: str, default_profile: str = "default") -> str:
        """Fallback resolver used when the type registry is unavailable.

        Ignores *note_type* and returns *default_profile* unchanged.
        """
        return default_profile
|
||
|
||
DEFAULT_OVERLAP = 0  # fallback when the chunker supplies no overlap

# Window overlap, in characters, for each known chunk-profile name.
DEFAULT_PROFILE_TO_OVERLAP = {
    "short": 40,
    "medium": 80,
    "long": 120,
    "default": 0,
}


def _estimate_overlap(profile: str) -> int:
    """Return the window overlap (in characters) for *profile*.

    Profile names missing from DEFAULT_PROFILE_TO_OVERLAP yield
    DEFAULT_OVERLAP.
    """
    return int(DEFAULT_PROFILE_TO_OVERLAP.get(profile, DEFAULT_OVERLAP))
|
||
|
||
def _make_window_text(chunks: List[Chunk], i: int, overlap_chars: int) -> str:
    """Build the context window for the chunk at index *i*.

    The window consists of the tail of the previous chunk, the chunk's own
    text, and the head of the next chunk, joined by single spaces.

    Args:
        chunks: All chunks, in order; each must expose a ``text`` attribute.
        i: Index of the chunk the window is built for.
        overlap_chars: Maximum number of characters borrowed from each
            neighbour. Values <= 0 disable windowing.

    Returns:
        The chunk's own text, unchanged, when ``overlap_chars <= 0``;
        otherwise the joined window with leading and trailing whitespace
        stripped.
    """
    center = chunks[i].text
    if overlap_chars <= 0:
        return center

    # Slicing clamps to the string length, so a neighbour shorter than
    # overlap_chars is simply taken whole -- no explicit length check needed.
    left = chunks[i - 1].text[-overlap_chars:] if i > 0 else ""
    right = chunks[i + 1].text[:overlap_chars] if i + 1 < len(chunks) else ""

    # Skip empty parts so the join never produces doubled spaces.
    return " ".join(part for part in (left, center, right) if part).strip()
|
||
|
||
def make_chunk_payloads(
    chunks: Iterable[Chunk],
    note_id: str,
    note_title: str,
    note_type: Optional[str],
    note_path: str,
    *,
    chunk_profile: Optional[str] = None,
    window_overwrite: Optional[int] = None,
    extra_payload: Optional[Dict[str, Any]] = None,
) -> List[Dict[str, Any]]:
    """Build one payload dict per chunk (backward-compatible factory).

    Args:
        chunks: Chunks produced by the chunker; consumed once into a list.
        note_id: ID of the note (e.g. from the frontmatter 'id').
        note_title: Title of the note.
        note_type: Type from the frontmatter (concept, task, ...), or None.
        note_path: Relative path of the note inside the vault.
        chunk_profile: Optional profile override (e.g. 'short'). When set it
            takes precedence over the profile resolved from *note_type*.
        window_overwrite: When an int, forces the overlap in characters and
            bypasses the profile-based estimate.
        extra_payload: Additional fields merged into every payload. Because
            it is applied last, its keys replace generated fields of the
            same name.

    Returns:
        A list of payload dicts, each containing at least:
        "note_id", "note_title", "note_type", "path", "chunk_id",
        "text", "window", "start", "end".
    """
    clist = list(chunks)

    # Profile precedence: explicit override, then the registry's answer for
    # the note type, then "default".
    if chunk_profile:
        effective_profile = chunk_profile
    elif note_type:
        effective_profile = resolve_chunk_profile(note_type) or "default"
    else:
        effective_profile = "default"

    if isinstance(window_overwrite, int):
        overlap = window_overwrite
    else:
        overlap = _estimate_overlap(effective_profile)

    payloads: List[Dict[str, Any]] = []
    for i, c in enumerate(clist):
        # Chunk IDs are "<note_id>#<running number 1..n>" (backward-compatible
        # format); an ID carried by the chunk itself is not used here.
        cid = f"{note_id}#{i + 1}"
        window_txt = _make_window_text(clist, i, overlap)

        pl: Dict[str, Any] = {
            "note_id": note_id,
            "note_title": note_title,
            "note_type": note_type,
            "path": note_path,
            "chunk_id": cid,
            "text": getattr(c, "text", ""),
            "window": window_txt,
            "start": getattr(c, "start", 0),
            "end": getattr(c, "end", 0),
        }
        if extra_payload:
            pl.update(extra_payload)
        payloads.append(pl)

    return payloads
|