All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
216 lines
6.7 KiB
Python
216 lines
6.7 KiB
Python
"""
|
|
chunk_payload.py — v1.4.2
|
|
-------------------------
|
|
Robuste, abwärtskompatible Payload-Erzeugung für Chunks.
|
|
|
|
Ziele
|
|
- Setzt pro Chunk `text`, `retriever_weight`, `chunk_profile`, `note_id`.
|
|
- Akzeptiert ParsedNote-Objekte *oder* Dicts, inklusive bereits vorsegmentierter .chunks.
|
|
- Verträgt zusätzliche args/kwargs (kompatibel zu älteren Aufrufern).
|
|
- Konfig-Auflösung identisch zu note_payload.py.
|
|
|
|
Autor: ChatGPT
|
|
Lizenz: MIT
|
|
"""
|
|
from __future__ import annotations

import hashlib
import os
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

# PyYAML is optional: without it, config discovery degrades to built-in defaults.
try:
    import yaml  # type: ignore
except Exception:  # pragma: no cover
    yaml = None  # type: ignore
|
|
|
|
|
|
def _as_dict(note: Any) -> Dict[str, Any]:
|
|
if isinstance(note, dict):
|
|
return dict(note)
|
|
out: Dict[str, Any] = {}
|
|
for attr in ("note_id", "id", "title", "type", "frontmatter", "meta", "body", "text", "content", "path", "chunks"):
|
|
if hasattr(note, attr):
|
|
out[attr] = getattr(note, attr)
|
|
if hasattr(note, "__dict__"):
|
|
for k, v in note.__dict__.items():
|
|
if k not in out:
|
|
out[k] = v
|
|
return out
|
|
|
|
|
|
def _load_types_config(search_root: Optional[Union[str, Path]] = None,
|
|
preloaded: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
|
|
if isinstance(preloaded, dict) and "types" in preloaded:
|
|
return preloaded
|
|
|
|
candidates: List[Path] = []
|
|
if search_root:
|
|
root = Path(search_root)
|
|
candidates.extend([root / "config.yaml", root / "config" / "config.yaml", root / "config" / "types.yaml"])
|
|
cwd = Path.cwd()
|
|
candidates.extend([cwd / "config.yaml", cwd / "config" / "config.yaml", cwd / "config" / "types.yaml"])
|
|
|
|
for p in candidates:
|
|
if p.exists() and p.is_file():
|
|
if yaml is None:
|
|
break
|
|
try:
|
|
data = yaml.safe_load(p.read_text(encoding="utf-8")) or {}
|
|
if isinstance(data, dict) and "types" in data:
|
|
return data
|
|
except Exception:
|
|
pass
|
|
return {"version": "1.0", "types": {}}
|
|
|
|
|
|
def _safe_get(d: Dict[str, Any], key: str, default: Any = None) -> Any:
|
|
if not isinstance(d, dict):
|
|
return default
|
|
return d.get(key, default)
|
|
|
|
|
|
def _resolve_type(note_d: Dict[str, Any]) -> str:
|
|
fm = note_d.get("frontmatter") or {}
|
|
t = _safe_get(fm, "type") or note_d.get("type")
|
|
if not t and isinstance(note_d.get("meta"), dict):
|
|
t = note_d["meta"].get("type")
|
|
return str(t or "concept")
|
|
|
|
|
|
def _resolve_note_id(note_d: Dict[str, Any]) -> Optional[str]:
|
|
for k in ("note_id", "id"):
|
|
v = note_d.get(k)
|
|
if isinstance(v, str) and v:
|
|
return v
|
|
return None
|
|
|
|
|
|
def _resolve_body(note_d: Dict[str, Any]) -> str:
|
|
for k in ("body", "text", "content"):
|
|
v = note_d.get(k)
|
|
if isinstance(v, str) and v.strip():
|
|
return v
|
|
return ""
|
|
|
|
|
|
def _resolve_defaults_for_type(types_cfg: Dict[str, Any], typ: str) -> Dict[str, Any]:
|
|
if not isinstance(types_cfg, dict):
|
|
return {}
|
|
t = (types_cfg.get("types") or {}).get(typ) or {}
|
|
return t if isinstance(t, dict) else {}
|
|
|
|
|
|
def _coerce_float(val: Any, default: float) -> float:
|
|
try:
|
|
if val is None:
|
|
return default
|
|
if isinstance(val, (int, float)):
|
|
return float(val)
|
|
if isinstance(val, str):
|
|
return float(val.strip())
|
|
except Exception:
|
|
pass
|
|
return default
|
|
|
|
|
|
def _compute_retriever_weight(note_d: Dict[str, Any], types_cfg: Dict[str, Any], typ: str) -> float:
    """Resolve the retriever weight for a note.

    Precedence: frontmatter > per-type config default >
    MINDNET_DEFAULT_RETRIEVER_WEIGHT env var > 1.0.

    Fix: the previous version did `"retriever_weight" in fm` on a frontmatter
    value that may be a non-dict (e.g. a stray YAML scalar or list); the `in`
    check can succeed there while the subsequent `fm.get(...)` raises
    AttributeError. Non-dict frontmatter is now ignored, matching how
    `_resolve_type` treats it via `_safe_get`.
    """
    fm = note_d.get("frontmatter")
    if isinstance(fm, dict) and "retriever_weight" in fm:
        return _coerce_float(fm.get("retriever_weight"), 1.0)
    tdef = _resolve_defaults_for_type(types_cfg, typ)
    if "retriever_weight" in tdef:
        return _coerce_float(tdef.get("retriever_weight"), 1.0)
    envv = os.getenv("MINDNET_DEFAULT_RETRIEVER_WEIGHT")
    if envv:
        return _coerce_float(envv, 1.0)
    return 1.0
|
|
|
|
|
|
def _compute_chunk_profile(note_d: Dict[str, Any], types_cfg: Dict[str, Any], typ: str) -> str:
    """Resolve the chunking profile for a note.

    Precedence: frontmatter > per-type config default >
    MINDNET_DEFAULT_CHUNK_PROFILE env var > "medium".

    Fix: a non-dict frontmatter value (stray YAML scalar/list) previously
    passed the `in` membership test and then crashed on `fm.get(...)`;
    such frontmatter is now ignored, consistent with `_resolve_type`.
    """
    fm = note_d.get("frontmatter")
    if isinstance(fm, dict) and "chunk_profile" in fm:
        return str(fm.get("chunk_profile"))
    tdef = _resolve_defaults_for_type(types_cfg, typ)
    if "chunk_profile" in tdef:
        return str(tdef.get("chunk_profile"))
    envv = os.getenv("MINDNET_DEFAULT_CHUNK_PROFILE")
    if envv:
        return str(envv)
    return "medium"
|
|
|
|
|
|
def _norm_chunk_text(s: Any) -> str:
|
|
if isinstance(s, str):
|
|
return s.strip()
|
|
return ""
|
|
|
|
|
|
def _hash(s: str) -> str:
|
|
return hashlib.sha1(s.encode("utf-8")).hexdigest()[:12]
|
|
|
|
|
|
def make_chunk_payloads(note: Any, *args, **kwargs) -> List[Dict[str, Any]]:
    """Build one payload dict per chunk of *note*.

    Accepts ParsedNote-like objects or plain dicts, with or without
    pre-segmented ``.chunks``. Extra positional arguments are ignored
    (backward compatibility with older callers).

    Recognized keyword arguments:
        types_config: dict shaped like config.yaml (``{"types": {...}}``).
        search_root / vault_root: root directory for config discovery.

    Returns:
        A list of dicts with keys ``note_id``, ``chunk_id``, ``text``,
        ``retriever_weight``, ``chunk_profile`` and ``type``. Empty when
        the note yields no usable text.

    Refactor: payload construction was duplicated in both branches (chunked
    and single-body); it is now a single inner helper so the two paths can
    no longer drift apart. Behavior is unchanged (``#000`` equals idx 0).
    """
    note_d = _as_dict(note)

    types_config = kwargs.get("types_config")
    search_root = kwargs.get("search_root") or kwargs.get("vault_root")
    types_cfg = _load_types_config(search_root, types_config)

    typ = _resolve_type(note_d)
    note_id = _resolve_note_id(note_d) or ""

    r_weight = _compute_retriever_weight(note_d, types_cfg, typ)
    c_profile = _compute_chunk_profile(note_d, types_cfg, typ)

    def _payload(text: str, idx: int) -> Dict[str, Any]:
        # Positional id when the note has an id, content-derived id otherwise.
        chunk_id = f"{note_id}#{idx:03d}" if note_id else _hash(text)[:8]
        return {
            "note_id": note_id,
            "chunk_id": chunk_id,
            "text": text,
            "retriever_weight": float(r_weight),
            "chunk_profile": str(c_profile),
            "type": typ,
        }

    out: List[Dict[str, Any]] = []

    # 1) Prefer chunks already produced by the parser.
    pre = note_d.get("chunks")
    if isinstance(pre, list) and pre:
        for idx, c in enumerate(pre):
            if isinstance(c, dict):
                text = _norm_chunk_text(c.get("text") or c.get("body") or c.get("content"))
            else:
                text = _norm_chunk_text(getattr(c, "text", ""))
            if not text:
                # Fall back to the note body for empty chunk entries.
                text = _resolve_body(note_d)
            if not text:
                continue
            out.append(_payload(text, idx))

    # 2) Otherwise emit a single chunk from the note body.
    if not out:
        text = _resolve_body(note_d)
        if text:
            out.append(_payload(text, 0))

    return out
|