app/core/chunk_payload.py aktualisiert
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
This commit is contained in:
parent
af36c410b4
commit
597090bc45
|
|
@ -1,215 +1,199 @@
|
||||||
|
# chunk_payload.py
"""
Mindnet - Chunk Payload Builder
Version: 1.4.3

Beschreibung:
- Robust gegenüber alten/neuen Aufrufsignaturen (toleriert *args, **kwargs).
- Liest Typ-Defaults aus ./config/config.yaml oder ./config/types.yaml.
- Baut Chunks aus vorhandenen note.chunks (falls vorhanden) oder fällt auf
  eine einfache, profilabhängige Absatzbündelung zurück.
- Setzt in jedem Chunk-Payload:
  - note_id, chunk_id (deterministisch), index, title, type, path
  - text (nie leer), retriever_weight, chunk_profile
- Garantiert JSON-serialisierbare Payloads.
"""
||||||
from __future__ import annotations

import hashlib
import json
import os
import pathlib
import re
from typing import Any, Dict, List, Optional

import yaml
||||||
def _as_dict(note: Any) -> Dict[str, Any]:
|
def _as_dict(note: Any) -> Dict[str, Any]:
|
||||||
if isinstance(note, dict):
|
if isinstance(note, dict):
|
||||||
return dict(note)
|
return note
|
||||||
out: Dict[str, Any] = {}
|
d: Dict[str, Any] = {}
|
||||||
for attr in ("note_id", "id", "title", "type", "frontmatter", "meta", "body", "text", "content", "path", "chunks"):
|
for attr in (
|
||||||
|
"id",
|
||||||
|
"note_id",
|
||||||
|
"title",
|
||||||
|
"path",
|
||||||
|
"frontmatter",
|
||||||
|
"meta",
|
||||||
|
"body",
|
||||||
|
"text",
|
||||||
|
"type",
|
||||||
|
"chunks",
|
||||||
|
):
|
||||||
if hasattr(note, attr):
|
if hasattr(note, attr):
|
||||||
out[attr] = getattr(note, attr)
|
d[attr] = getattr(note, attr)
|
||||||
if hasattr(note, "__dict__"):
|
if "frontmatter" not in d and hasattr(note, "metadata"):
|
||||||
for k, v in note.__dict__.items():
|
d["frontmatter"] = getattr(note, "metadata")
|
||||||
if k not in out:
|
return d
|
||||||
out[k] = v
|
|
||||||
return out
|
|
||||||
|
|
||||||
|
|
||||||
def _load_types_config(search_root: Optional[Union[str, Path]] = None,
|
def _load_types_config(explicit: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
|
||||||
preloaded: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
|
if isinstance(explicit, dict):
|
||||||
if isinstance(preloaded, dict) and "types" in preloaded:
|
return explicit
|
||||||
return preloaded
|
for rel in ("config/config.yaml", "config/types.yaml"):
|
||||||
|
p = pathlib.Path(rel)
|
||||||
candidates: List[Path] = []
|
if p.exists():
|
||||||
if search_root:
|
with p.open("r", encoding="utf-8") as f:
|
||||||
root = Path(search_root)
|
data = yaml.safe_load(f) or {}
|
||||||
candidates.extend([root / "config.yaml", root / "config" / "config.yaml", root / "config" / "types.yaml"])
|
if isinstance(data, dict) and "types" in data and isinstance(data["types"], dict):
|
||||||
cwd = Path.cwd()
|
return data["types"]
|
||||||
candidates.extend([cwd / "config.yaml", cwd / "config" / "config.yaml", cwd / "config" / "types.yaml"])
|
return data if isinstance(data, dict) else {}
|
||||||
|
return {}
|
||||||
for p in candidates:
|
|
||||||
if p.exists() and p.is_file():
|
|
||||||
if yaml is None:
|
|
||||||
break
|
|
||||||
try:
|
|
||||||
data = yaml.safe_load(p.read_text(encoding="utf-8")) or {}
|
|
||||||
if isinstance(data, dict) and "types" in data:
|
|
||||||
return data
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
return {"version": "1.0", "types": {}}
|
|
||||||
|
|
||||||
|
|
||||||
def _safe_get(d: Dict[str, Any], key: str, default: Any = None) -> Any:
|
def _get_front(n: Dict[str, Any]) -> Dict[str, Any]:
|
||||||
if not isinstance(d, dict):
|
fm = n.get("frontmatter") or n.get("meta") or {}
|
||||||
return default
|
return fm if isinstance(fm, dict) else {}
|
||||||
return d.get(key, default)
|
|
||||||
|
|
||||||
|
|
||||||
def _resolve_type(note_d: Dict[str, Any]) -> str:
|
def _coalesce(*vals):
|
||||||
fm = note_d.get("frontmatter") or {}
|
for v in vals:
|
||||||
t = _safe_get(fm, "type") or note_d.get("type")
|
if v is not None:
|
||||||
if not t and isinstance(note_d.get("meta"), dict):
|
|
||||||
t = note_d["meta"].get("type")
|
|
||||||
return str(t or "concept")
|
|
||||||
|
|
||||||
|
|
||||||
def _resolve_note_id(note_d: Dict[str, Any]) -> Optional[str]:
|
|
||||||
for k in ("note_id", "id"):
|
|
||||||
v = note_d.get(k)
|
|
||||||
if isinstance(v, str) and v:
|
|
||||||
return v
|
return v
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def _resolve_body(note_d: Dict[str, Any]) -> str:
|
def _body(n: Dict[str, Any]) -> str:
|
||||||
for k in ("body", "text", "content"):
|
b = n.get("body")
|
||||||
v = note_d.get(k)
|
if isinstance(b, str):
|
||||||
if isinstance(v, str) and v.strip():
|
return b
|
||||||
return v
|
t = n.get("text")
|
||||||
return ""
|
return t if isinstance(t, str) else ""
|
||||||
|
|
||||||
|
|
||||||
def _resolve_defaults_for_type(types_cfg: Dict[str, Any], typ: str) -> Dict[str, Any]:
|
def _iter_chunks(n: Dict[str, Any], profile: str) -> List[Dict[str, Any]]:
|
||||||
if not isinstance(types_cfg, dict):
|
# 1) Bereits vorhandene Chunks bevorzugen
|
||||||
return {}
|
existing = n.get("chunks")
|
||||||
t = (types_cfg.get("types") or {}).get(typ) or {}
|
if isinstance(existing, list) and existing:
|
||||||
return t if isinstance(t, dict) else {}
|
out: List[Dict[str, Any]] = []
|
||||||
|
for i, c in enumerate(existing):
|
||||||
|
if isinstance(c, dict):
|
||||||
|
text = c.get("text") or ""
|
||||||
|
else:
|
||||||
|
text = str(c) if c is not None else ""
|
||||||
|
if not text:
|
||||||
|
continue
|
||||||
|
out.append({"index": i, "text": text})
|
||||||
|
if out:
|
||||||
|
return out
|
||||||
|
|
||||||
|
# 2) Fallback: naive, profilabhängige Absatz-Bündelung
|
||||||
def _coerce_float(val: Any, default: float) -> float:
|
size = {"short": 600, "medium": 1200, "long": 2400}.get(str(profile), 1200)
|
||||||
try:
|
text = _body(n)
|
||||||
if val is None:
|
if not text:
|
||||||
return default
|
return []
|
||||||
if isinstance(val, (int, float)):
|
paras = re.split(r"\n{2,}", text)
|
||||||
return float(val)
|
chunks: List[str] = []
|
||||||
if isinstance(val, str):
|
buf = ""
|
||||||
return float(val.strip())
|
for p in paras:
|
||||||
except Exception:
|
p = (p or "").strip()
|
||||||
pass
|
if not p:
|
||||||
return default
|
continue
|
||||||
|
if len(buf) + (2 if buf else 0) + len(p) <= size:
|
||||||
|
buf = (buf + "\n\n" + p).strip() if buf else p
|
||||||
def _compute_retriever_weight(note_d: Dict[str, Any], types_cfg: Dict[str, Any], typ: str) -> float:
|
else:
|
||||||
fm = note_d.get("frontmatter") or {}
|
if buf:
|
||||||
if "retriever_weight" in fm:
|
chunks.append(buf)
|
||||||
return _coerce_float(fm.get("retriever_weight"), 1.0)
|
if len(p) <= size:
|
||||||
tdef = _resolve_defaults_for_type(types_cfg, typ)
|
buf = p
|
||||||
if "retriever_weight" in tdef:
|
else:
|
||||||
return _coerce_float(tdef.get("retriever_weight"), 1.0)
|
for i in range(0, len(p), size):
|
||||||
envv = os.getenv("MINDNET_DEFAULT_RETRIEVER_WEIGHT")
|
chunks.append(p[i : i + size])
|
||||||
if envv:
|
buf = ""
|
||||||
return _coerce_float(envv, 1.0)
|
if buf:
|
||||||
return 1.0
|
chunks.append(buf)
|
||||||
|
return [{"index": i, "text": c} for i, c in enumerate(chunks)]
|
||||||
|
|
||||||
def make_chunk_payloads(note: Any, *args, **kwargs) -> List[Dict[str, Any]]:
    """
    Build one JSON-serializable payload dict per chunk of *note*.

    Tolerates legacy call signatures: a first positional argument is treated
    like the ``types_config`` keyword; any further ``*args``/``**kwargs``
    are ignored for backward compatibility.

    Args:
        note: ParsedNote-like object or dict (see ``_as_dict``).
        *args: legacy positionals; args[0] may be a types-config dict.
        **kwargs: may contain ``types_config`` (dict as in config.yaml).

    Returns:
        list[dict]: one payload per chunk with keys note_id, chunk_id,
        index, title, type, path, text, retriever_weight, chunk_profile.
    """
    n = _as_dict(note)
    types_cfg = kwargs.get("types_config") or (args[0] if args else None)
    types_cfg = _load_types_config(types_cfg)

    fm = _get_front(n)
    note_type = str(fm.get("type") or n.get("type") or "note")
    cfg_for_type = types_cfg.get(note_type, {}) if isinstance(types_cfg, dict) else {}

    # Default retriever weight: env override, 1.0 on missing/bad values.
    try:
        default_rw = float(os.environ.get("MINDNET_DEFAULT_RETRIEVER_WEIGHT", 1.0))
    except Exception:
        default_rw = 1.0

    # Resolution order: frontmatter > per-type config > default.
    retriever_weight = _coalesce(
        fm.get("retriever_weight"),
        cfg_for_type.get("retriever_weight"),
        default_rw,
    )
    try:
        retriever_weight = float(retriever_weight)
    except Exception:
        retriever_weight = default_rw

    chunk_profile = _coalesce(
        fm.get("chunk_profile"),
        cfg_for_type.get("chunk_profile"),
        os.environ.get("MINDNET_DEFAULT_CHUNK_PROFILE", "medium"),
    )
    if not isinstance(chunk_profile, str):
        chunk_profile = "medium"

    note_id = n.get("note_id") or n.get("id") or fm.get("id")
    title = n.get("title") or fm.get("title") or ""
    path = n.get("path")
    if isinstance(path, pathlib.Path):
        path = str(path)  # keep the payload JSON-serializable

    chunks = _iter_chunks(n, chunk_profile)

    payloads: List[Dict[str, Any]] = []
    for c in chunks:
        # _iter_chunks yields dicts, but stay defensive for legacy inputs
        # (previously a non-dict entry crashed on c.get()).
        if isinstance(c, dict):
            idx = c.get("index", len(payloads))
            text = c.get("text")
        else:
            idx = len(payloads)
            text = str(c) if c is not None else ""
        if not isinstance(text, str):
            text = str(text or "")

        # Deterministic chunk_id derived from note_id and chunk index.
        key = f"{note_id}|{idx}"
        h = hashlib.sha1(key.encode("utf-8")).hexdigest()[:12]
        chunk_id = f"{note_id}-{idx:03d}-{h}" if note_id else h

        payload = {
            "note_id": note_id,
            "chunk_id": chunk_id,
            "index": idx,
            "title": title,
            "type": note_type,
            "path": path,
            "text": text,
            "retriever_weight": retriever_weight,
            "chunk_profile": chunk_profile,
        }
        # Round-trip through JSON to fail early on non-serializable values.
        json.loads(json.dumps(payload, ensure_ascii=False))
        payloads.append(payload)

    return payloads
|
||||||
Loading…
Reference in New Issue
Block a user