app/core/chunk_payload.py aktualisiert
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s

This commit is contained in:
Lars 2025-11-09 10:11:34 +01:00
parent af36c410b4
commit 597090bc45

View File

@@ -1,215 +1,199 @@
# chunk_payload.py
"""
Mindnet - Chunk Payload Builder
Version: 1.4.3

Description:
- Robust against old/new call signatures (tolerates *args, **kwargs).
- Reads type defaults from ./config/config.yaml or ./config/types.yaml.
- Builds chunks from existing note.chunks (if present) or falls back to a
  simple, profile-dependent paragraph bundling.
- Sets in every chunk payload:
  - note_id, chunk_id (deterministic), index, title, type, path
  - text (never empty), retriever_weight, chunk_profile
- Guarantees JSON-serializable payloads.
"""
from __future__ import annotations

import hashlib
import json
import os
import pathlib
import re
from typing import Any, Dict, List, Optional

import yaml
def _as_dict(note: Any) -> Dict[str, Any]: def _as_dict(note: Any) -> Dict[str, Any]:
if isinstance(note, dict): if isinstance(note, dict):
return dict(note) return note
out: Dict[str, Any] = {} d: Dict[str, Any] = {}
for attr in ("note_id", "id", "title", "type", "frontmatter", "meta", "body", "text", "content", "path", "chunks"): for attr in (
"id",
"note_id",
"title",
"path",
"frontmatter",
"meta",
"body",
"text",
"type",
"chunks",
):
if hasattr(note, attr): if hasattr(note, attr):
out[attr] = getattr(note, attr) d[attr] = getattr(note, attr)
if hasattr(note, "__dict__"): if "frontmatter" not in d and hasattr(note, "metadata"):
for k, v in note.__dict__.items(): d["frontmatter"] = getattr(note, "metadata")
if k not in out: return d
out[k] = v
return out
def _load_types_config(search_root: Optional[Union[str, Path]] = None, def _load_types_config(explicit: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
preloaded: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: if isinstance(explicit, dict):
if isinstance(preloaded, dict) and "types" in preloaded: return explicit
return preloaded for rel in ("config/config.yaml", "config/types.yaml"):
p = pathlib.Path(rel)
candidates: List[Path] = [] if p.exists():
if search_root: with p.open("r", encoding="utf-8") as f:
root = Path(search_root) data = yaml.safe_load(f) or {}
candidates.extend([root / "config.yaml", root / "config" / "config.yaml", root / "config" / "types.yaml"]) if isinstance(data, dict) and "types" in data and isinstance(data["types"], dict):
cwd = Path.cwd() return data["types"]
candidates.extend([cwd / "config.yaml", cwd / "config" / "config.yaml", cwd / "config" / "types.yaml"]) return data if isinstance(data, dict) else {}
return {}
for p in candidates:
if p.exists() and p.is_file():
if yaml is None:
break
try:
data = yaml.safe_load(p.read_text(encoding="utf-8")) or {}
if isinstance(data, dict) and "types" in data:
return data
except Exception:
pass
return {"version": "1.0", "types": {}}
def _safe_get(d: Dict[str, Any], key: str, default: Any = None) -> Any: def _get_front(n: Dict[str, Any]) -> Dict[str, Any]:
if not isinstance(d, dict): fm = n.get("frontmatter") or n.get("meta") or {}
return default return fm if isinstance(fm, dict) else {}
return d.get(key, default)
def _resolve_type(note_d: Dict[str, Any]) -> str: def _coalesce(*vals):
fm = note_d.get("frontmatter") or {} for v in vals:
t = _safe_get(fm, "type") or note_d.get("type") if v is not None:
if not t and isinstance(note_d.get("meta"), dict):
t = note_d["meta"].get("type")
return str(t or "concept")
def _resolve_note_id(note_d: Dict[str, Any]) -> Optional[str]:
for k in ("note_id", "id"):
v = note_d.get(k)
if isinstance(v, str) and v:
return v return v
return None return None
def _resolve_body(note_d: Dict[str, Any]) -> str: def _body(n: Dict[str, Any]) -> str:
for k in ("body", "text", "content"): b = n.get("body")
v = note_d.get(k) if isinstance(b, str):
if isinstance(v, str) and v.strip(): return b
return v t = n.get("text")
return "" return t if isinstance(t, str) else ""
def _resolve_defaults_for_type(types_cfg: Dict[str, Any], typ: str) -> Dict[str, Any]: def _iter_chunks(n: Dict[str, Any], profile: str) -> List[Dict[str, Any]]:
if not isinstance(types_cfg, dict): # 1) Bereits vorhandene Chunks bevorzugen
return {} existing = n.get("chunks")
t = (types_cfg.get("types") or {}).get(typ) or {} if isinstance(existing, list) and existing:
return t if isinstance(t, dict) else {} out: List[Dict[str, Any]] = []
for i, c in enumerate(existing):
if isinstance(c, dict):
text = c.get("text") or ""
else:
text = str(c) if c is not None else ""
if not text:
continue
out.append({"index": i, "text": text})
if out:
return out
# 2) Fallback: naive, profilabhängige Absatz-Bündelung
def _coerce_float(val: Any, default: float) -> float: size = {"short": 600, "medium": 1200, "long": 2400}.get(str(profile), 1200)
try: text = _body(n)
if val is None: if not text:
return default return []
if isinstance(val, (int, float)): paras = re.split(r"\n{2,}", text)
return float(val) chunks: List[str] = []
if isinstance(val, str): buf = ""
return float(val.strip()) for p in paras:
except Exception: p = (p or "").strip()
pass if not p:
return default continue
if len(buf) + (2 if buf else 0) + len(p) <= size:
buf = (buf + "\n\n" + p).strip() if buf else p
def _compute_retriever_weight(note_d: Dict[str, Any], types_cfg: Dict[str, Any], typ: str) -> float: else:
fm = note_d.get("frontmatter") or {} if buf:
if "retriever_weight" in fm: chunks.append(buf)
return _coerce_float(fm.get("retriever_weight"), 1.0) if len(p) <= size:
tdef = _resolve_defaults_for_type(types_cfg, typ) buf = p
if "retriever_weight" in tdef: else:
return _coerce_float(tdef.get("retriever_weight"), 1.0) for i in range(0, len(p), size):
envv = os.getenv("MINDNET_DEFAULT_RETRIEVER_WEIGHT") chunks.append(p[i : i + size])
if envv: buf = ""
return _coerce_float(envv, 1.0) if buf:
return 1.0 chunks.append(buf)
return [{"index": i, "text": c} for i, c in enumerate(chunks)]
def _compute_chunk_profile(note_d: Dict[str, Any], types_cfg: Dict[str, Any], typ: str) -> str:
fm = note_d.get("frontmatter") or {}
if "chunk_profile" in fm:
return str(fm.get("chunk_profile"))
tdef = _resolve_defaults_for_type(types_cfg, typ)
if "chunk_profile" in tdef:
return str(tdef.get("chunk_profile"))
envv = os.getenv("MINDNET_DEFAULT_CHUNK_PROFILE")
if envv:
return str(envv)
return "medium"
def _norm_chunk_text(s: Any) -> str:
if isinstance(s, str):
return s.strip()
return ""
def _hash(s: str) -> str:
return hashlib.sha1(s.encode("utf-8")).hexdigest()[:12]
def make_chunk_payloads(note: Any, *args, **kwargs) -> List[Dict[str, Any]]:
    """Build one payload dict per chunk of *note*.

    Tolerates legacy positional arguments: ``args[0]`` (or the
    ``types_config`` kwarg) may be a preloaded types-config dict; all other
    extra args/kwargs are ignored for backward compatibility.

    Args:
        note: ParsedNote-like object or dict (see ``_as_dict``).

    Returns:
        A list of JSON-serializable dicts, each with: note_id, chunk_id
        (deterministic, derived from note_id + index via SHA-1), index,
        title, type, path, text, retriever_weight, chunk_profile.
    """
    n = _as_dict(note)
    types_cfg = kwargs.get("types_config") or (args[0] if args else None)
    types_cfg = _load_types_config(types_cfg)

    fm = _get_front(n)
    note_type = str(fm.get("type") or n.get("type") or "note")
    # Guard against a non-dict per-type entry in the config (would break .get).
    cfg_for_type = types_cfg.get(note_type) if isinstance(types_cfg, dict) else None
    if not isinstance(cfg_for_type, dict):
        cfg_for_type = {}

    try:
        default_rw = float(os.environ.get("MINDNET_DEFAULT_RETRIEVER_WEIGHT", 1.0))
    except Exception:
        default_rw = 1.0

    # Precedence: frontmatter > type config > environment default.
    retriever_weight = _coalesce(
        fm.get("retriever_weight"),
        cfg_for_type.get("retriever_weight"),
        default_rw,
    )
    try:
        retriever_weight = float(retriever_weight)
    except Exception:
        retriever_weight = default_rw

    chunk_profile = _coalesce(
        fm.get("chunk_profile"),
        cfg_for_type.get("chunk_profile"),
        os.environ.get("MINDNET_DEFAULT_CHUNK_PROFILE", "medium"),
    )
    if not isinstance(chunk_profile, str):
        chunk_profile = "medium"

    note_id = n.get("note_id") or n.get("id") or fm.get("id")
    title = n.get("title") or fm.get("title") or ""
    path = n.get("path")
    if isinstance(path, pathlib.Path):
        path = str(path)

    chunks = _iter_chunks(n, chunk_profile)

    payloads: List[Dict[str, Any]] = []
    for c in chunks:
        idx = c.get("index", len(payloads))
        text = c.get("text") if isinstance(c, dict) else (str(c) if c is not None else "")
        if not isinstance(text, str):
            text = str(text or "")

        # Deterministic chunk_id: stable across runs for the same note/index.
        key = f"{note_id}|{idx}"
        h = hashlib.sha1(key.encode("utf-8")).hexdigest()[:12]
        chunk_id = f"{note_id}-{idx:03d}-{h}" if note_id else h

        payload = {
            "note_id": note_id,
            "chunk_id": chunk_id,
            "index": idx,
            "title": title,
            "type": note_type,
            "path": path,
            "text": text,
            "retriever_weight": retriever_weight,
            "chunk_profile": chunk_profile,
        }
        # Ensure the payload is JSON-serializable (raises otherwise).
        json.loads(json.dumps(payload, ensure_ascii=False))
        payloads.append(payload)
    return payloads