app/core/chunk_payload.py aktualisiert

2025-11-08 21:22:17 +01:00 · 2025-11-08 21:22:17 +01:00 · 2de786fc64
commit 2de786fc64
parent 2ddf034983
1 changed files with 197 additions and 135 deletions
--- a/app/core/chunk_payload.py
+++ b/app/core/chunk_payload.py
@@ -1,158 +1,220 @@
+# app/core/chunk_payload.py
+# Line count: 214
+
 from __future__ import annotations
-from typing import Any, Dict, Iterable, List, Optional, Union

-# ---- Helpers ----
-def _coerce_float(val: Any) -> Optional[float]:
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+
+
+def _get(obj: Any, key: str, default: Any = None) -> Any:
+    """Return ``obj[key]`` (dicts) or ``obj.key`` (objects), else ``default``.
+
+    ``None`` counts as missing: a stored ``None`` yields ``default``.
+
+    Args:
+        obj: Dict, arbitrary object, or ``None``.
+        key: Dict key / attribute name to look up.
+        default: Value returned when nothing usable is found.
+
+    Returns:
+        The first non-``None`` value found, otherwise ``default``.
+    """
+    if obj is None:
+        return default
+    if isinstance(obj, dict):
+        # Dict entries win over attributes; otherwise keys such as "items",
+        # "keys" or "values" would resolve to the bound dict methods.
+        val = obj.get(key)
+        if val is not None:
+            return val
+        if hasattr(dict, key):
+            # The name is only a dict method, never data.
+            return default
+    try:
+        # getattr() with a default instead of hasattr(): hasattr() only
+        # swallows AttributeError, so a property raising anything else would
+        # propagate before a try block around getattr() is even entered.
+        val = getattr(obj, key, None)
+    except Exception:
+        # Best effort: a broken property is treated like a missing attribute.
+        return default
+    return val if val is not None else default
+
+
+def _get_frontmatter(note: Any) -> Dict[str, Any]:
+    """Return the note's frontmatter dict, or an empty dict if there is none.
+
+    Looks at ``note.frontmatter`` / ``note["frontmatter"]`` first and then at
+    a ``frontmatter`` entry nested inside the note's ``meta`` dict.
+    """
+    direct = _get(note, "frontmatter", None)
+    if isinstance(direct, dict):
+        return direct
+
+    meta = _get(note, "meta", None)
+    if not isinstance(meta, dict):
+        return {}
+    nested = meta.get("frontmatter")
+    return nested if isinstance(nested, dict) else {}
+
+
+def _get_from_frontmatter(fm: Dict[str, Any], key: str, default: Any = None) -> Any:
+    """Look up ``key`` in the frontmatter dict ``fm``.
+
+    Returns ``default`` when ``fm`` is not a dict, the key is absent, or the
+    stored value is ``None``.
+    """
+    if not isinstance(fm, dict):
+        return default
+    found = fm.get(key)
+    return default if found is None else found
+
+
+def _coerce_tags(val: Any) -> List[str]:
+    """Normalize a ``tags`` value to a list of non-empty, stripped strings.
+
+    Args:
+        val: ``None``, a comma-separated string, or a list/tuple of tags.
+
+    Returns:
+        The cleaned tags in input order; ``[]`` for ``None`` or any other type.
+    """
     if val is None:
-        return None
-    try:
-        if isinstance(val, (int, float)):
-            return float(val)
-        if isinstance(val, str):
-            v = val.strip()
-            if not v:
-                return None
-            return float(v.replace(",", "."))
-    except Exception:
-        return None
-    return None
-
-def _extract_weight(frontmatter: Dict[str, Any], explicit: Optional[float]) -> Optional[float]:
-    if explicit is not None:
-        return _coerce_float(explicit)
-    if frontmatter is None:
-        return None
-    if "retriever_weight" in frontmatter:
-        return _coerce_float(frontmatter.get("retriever_weight"))
-    # also accept nested style: retriever: { weight: 0.8 }
-    retriever = frontmatter.get("retriever")
-    if isinstance(retriever, dict) and "weight" in retriever:
-        return _coerce_float(retriever.get("weight"))
-    return None
-
-def _ensure_list(x: Any) -> List[Any]:
-    if x is None:
         return []
-    if isinstance(x, list):
-        return x
-    return [x]
+    if isinstance(val, str):
+        items = val.split(",")
+    elif isinstance(val, (list, tuple)):
+        items = val
+    else:
+        return []
+    # Same cleanup for both input shapes: skip None entries (which str()
+    # would turn into the tag "None"), strip whitespace, drop empty tags.
+    cleaned = (str(item).strip() for item in items if item is not None)
+    return [tag for tag in cleaned if tag]

-def _resolve_note_id(frontmatter: Dict[str, Any], kw_note_id: Optional[str]) -> Optional[str]:
-    if kw_note_id:
-        return kw_note_id
-    if not isinstance(frontmatter, dict):
-        return None
-    return frontmatter.get("id") or frontmatter.get("note_id")

-def _base_fields(frontmatter: Dict[str, Any], note_id: Optional[str], path: str) -> Dict[str, Any]:
-    title = None
-    typ = None
-    tags = None
-    if isinstance(frontmatter, dict):
-        title = frontmatter.get("title")
-        typ = frontmatter.get("type") or frontmatter.get("note_type")
-        # tags can be list[str] or comma separated string
-        tags = frontmatter.get("tags")
-        if isinstance(tags, str):
-            tags = [t.strip() for t in tags.split(",") if t.strip()]
+def _coerce_weight(val: Any) -> Optional[float]:
+    """Convert a frontmatter weight to ``float``; ``None`` if it is not numeric."""
+    if isinstance(val, bool):
+        # bool is an int subclass; ``retriever_weight: true`` is not a weight.
+        return None
+    if isinstance(val, (int, float)):
+        return float(val)
+    if isinstance(val, str):
+        try:
+            # Quoted numbers and decimal commas ("0,8") are accepted as well.
+            return float(val.strip().replace(",", "."))
+        except ValueError:
+            return None
+    return None
+
+
+def _resolve_retriever_weight(
+    fm: Dict[str, Any],
+    explicit: Optional[float],
+) -> Optional[float]:
+    """Determine the retriever weight for a note.
+
+    Precedence: ``explicit`` argument, then frontmatter ``retriever_weight``,
+    then the nested form ``retriever: {weight: ...}``.
+
+    Args:
+        fm: Frontmatter dict; anything that is not a dict is treated as empty.
+        explicit: Caller-supplied weight, returned unchanged when not ``None``.
+
+    Returns:
+        The weight, or ``None`` when no usable value is found.
+    """
+    if explicit is not None:
+        return explicit
+    if not isinstance(fm, dict):
+        return None
+    weight = _coerce_weight(fm.get("retriever_weight"))
+    if weight is not None:
+        return weight
+    # An unusable top-level value falls through to the nested form.
+    retr = fm.get("retriever")
+    if isinstance(retr, dict):
+        return _coerce_weight(retr.get("weight"))
+    return None
+
+
+def _resolve_note_fields(note: Any) -> Dict[str, Any]:
+    """Collect the note-level metadata that is copied onto each chunk payload.
+
+    Every field is read from the frontmatter first and then from the note
+    itself (attribute or dict key); ``tags`` is normalized with
+    ``_coerce_tags``. The resolved frontmatter dict is returned under the
+    ``"frontmatter"`` key.
+    """
+    fm = _get_frontmatter(note)
+
+    def _pick(fm_key: str, *note_keys: str) -> Any:
+        # Frontmatter value first, then the note keys in the given order.
+        found = _get_from_frontmatter(fm, fm_key, None)
+        for note_key in note_keys:
+            if found is not None:
+                break
+            found = _get(note, note_key, None)
+        return found
+
+    note_id = _pick("id", "note_id", "id")
+    title = _pick("title", "title")
+    ntype = _pick("type", "type")
+    tags = _coerce_tags(_pick("tags", "tags"))
+    path = _pick("path", "path", "source", "filepath")
+
     return {
         "note_id": note_id,
         "title": title,
-        "type": typ,
+        "type": ntype,
         "tags": tags,
-        "path": path or None,
+        "path": path,
+        "frontmatter": fm,
     }

-# ---- Public API ----
+
+def _coerce_index(raw: Any, fallback: int) -> int:
+    """Return ``int(raw)``, or ``fallback`` if ``raw`` is ``None`` or not convertible."""
+    if raw is None:
+        return fallback
+    try:
+        return int(raw)
+    except (TypeError, ValueError, OverflowError):
+        return fallback
+
+
+def _extract_chunk_text_and_index(
+    chunk: Any,
+    fallback_index: int,
+) -> Tuple[str, int]:
+    """
+    Extract ``(text, index)`` from one chunk.
+
+    Accepted chunk formats:
+    - str (plain text)
+    - dict with keys: text | window | body | content
+    - object with attributes: text | window | body | content
+    - (text, idx) tuple
+
+    Args:
+        chunk: The chunk in one of the formats above.
+        fallback_index: Index used when the chunk carries no usable ``index``.
+
+    Returns:
+        The chunk text and its integer index.
+
+    Raises:
+        ValueError: If no string text can be found in ``chunk``.
+    """
+    text_keys = ("text", "window", "body", "content")
+
+    # Tuple (text, idx)
+    if isinstance(chunk, tuple) and len(chunk) == 2 and isinstance(chunk[0], str):
+        return chunk[0], _coerce_index(chunk[1], fallback_index)
+
+    # Plain string
+    if isinstance(chunk, str):
+        return chunk, fallback_index
+
+    # Dict
+    if isinstance(chunk, dict):
+        strings = [v for v in (chunk.get(k) for k in text_keys) if isinstance(v, str)]
+        if strings:
+            # Prefer the first non-empty text, but accept an empty string
+            # rather than rejecting the chunk: empty text is valid for plain
+            # strings and objects as well.
+            txt = next((s for s in strings if s), strings[0])
+            return txt, _coerce_index(chunk.get("index"), fallback_index)
+
+    # Object with attributes (also reached by dicts without string text)
+    for attr in text_keys:
+        try:
+            # getattr() with a default: hasattr() would let non-AttributeError
+            # exceptions from properties escape.
+            txt = getattr(chunk, attr, None)
+        except Exception:
+            txt = None
+        if isinstance(txt, str):
+            try:
+                raw_idx = getattr(chunk, "index", None)
+            except Exception:
+                raw_idx = None
+            return txt, _coerce_index(raw_idx, fallback_index)
+
+    # Nothing matched -> explicit error
+    raise ValueError("Unsupported chunk format: cannot extract text/index")
+
+
 def make_chunk_payloads(
-    frontmatter: Dict[str, Any],
-    *args,
-    note_id: Optional[str] = None,
-    chunks: Optional[Iterable[Any]] = None,
-    path: str = "",
-    chunk_profile: Optional[str] = None,
+    note: Any,
+    chunks: Iterable[Any],
+    *,
    retriever_weight: Optional[float] = None,
-    **kwargs,
+    base_payload: Optional[Dict[str, Any]] = None,
 ) -> List[Dict[str, Any]]:
    """
-    Build chunk payload dictionaries for Qdrant.
-
-    This function is intentionally permissive to stay compatible with older callers:
-    - If `chunks` is a list of dictionaries that already contain payload-like fields,
-      those are augmented.
-    - If `chunks` is a list of strings, minimal payloads are created.
-    - If `chunks` is a list of dicts with keys like `text`, `window`, or `index`, they are normalized.
-
-    Always injects `retriever_weight` into each payload when available (from explicit arg or frontmatter).
+    Build Qdrant payloads for the chunk points of one note.
+
+    - Copies note metadata (note_id/title/type/tags/path) into every payload.
+    - Writes ``text`` and ``chunk_index`` per chunk.
+    - Sets ``retriever_weight`` when given explicitly or found in the frontmatter.
+
+    Args:
+        note: Note as dict or object; fields are read via ``_resolve_note_fields``.
+        chunks: Chunks in any format ``_extract_chunk_text_and_index`` accepts.
+        retriever_weight: Explicit weight; takes precedence over the frontmatter.
+        base_payload: Extra fields added to every payload; ``None`` values are
+            dropped and note fields override equal keys.
+
+    Returns:
+        One payload dict per chunk, in input order.
+
+    Raises:
+        ValueError: If a chunk has an unsupported format.
    """
-    # Backward-compat for callers that might pass via kwargs
-    if chunks is None:
-        chunks = kwargs.get("payloads") or kwargs.get("pls") or kwargs.get("items") or kwargs.get("chunk_items")
-
-    note_id_resolved = _resolve_note_id(frontmatter, note_id)
-    weight = _extract_weight(frontmatter, retriever_weight)
-    base = _base_fields(frontmatter, note_id_resolved, path)
-
    out: List[Dict[str, Any]] = []
-    for idx, item in enumerate(_ensure_list(chunks)):
-        # Case A: already a full payload dict (heuristic: has 'text' or 'window' or 'note_id' keys)
-        if isinstance(item, dict) and ("text" in item or "window" in item or "note_id" in item):
-            pl = dict(item)  # shallow copy
-            # ensure base fields exist if missing
-            for k, v in base.items():
-                pl.setdefault(k, v)
-            # ensure chunk_index if not present
-            pl.setdefault("chunk_index", item.get("index", idx))
-            # inject retriever_weight
-            if weight is not None:
-                pl["retriever_weight"] = weight
-            out.append(pl)
-            continue
+    note_fields = _resolve_note_fields(note)
+    fm = note_fields["frontmatter"]
+    rw = _resolve_retriever_weight(fm, retriever_weight)

-        # Case B: item is a dict with nested 'payload'
-        if isinstance(item, dict) and "payload" in item and isinstance(item["payload"], dict):
-            pl = dict(item["payload"])
-            for k, v in base.items():
-                pl.setdefault(k, v)
-            pl.setdefault("chunk_index", pl.get("index", idx))
-            if weight is not None:
-                pl["retriever_weight"] = weight
-            out.append(pl)
-            continue
+    # Base fields that every chunk should carry
+    common: Dict[str, Any] = {}
+    if base_payload:
+        common.update({k: v for k, v in base_payload.items() if v is not None})

-        # Case C: item is a plain string -> treat as text (no window context)
-        if isinstance(item, str):
-            text_val = item
-            pl = {
-                **base,
-                "chunk_index": idx,
-                "text": text_val,
-                "window": text_val,
-            }
-            if weight is not None:
-                pl["retriever_weight"] = weight
-            out.append(pl)
-            continue
+    # Note fields are written after base_payload, so they win on equal keys;
+    # missing fields (None / empty tags) are left out of the payload entirely.
+    if note_fields.get("note_id") is not None:
+        common["note_id"] = note_fields["note_id"]
+    if note_fields.get("title") is not None:
+        common["title"] = note_fields["title"]
+    if note_fields.get("type") is not None:
+        common["type"] = note_fields["type"]
+    if note_fields.get("tags"):
+        common["tags"] = note_fields["tags"]
+    if note_fields.get("path") is not None:
+        common["path"] = note_fields["path"]
+    if rw is not None:
+        common["retriever_weight"] = rw

-        # Case D: item has 'text'/'window' under different names
-        if isinstance(item, dict):
-            text_val = item.get("text") or item.get("body") or item.get("content") or ""
-            window_val = item.get("window") or text_val
-            pl = {
-                **base,
-                "chunk_index": item.get("chunk_index", item.get("index", idx)),
-                "text": text_val,
-                "window": window_val,
-            }
-            if weight is not None:
-                pl["retriever_weight"] = weight
-            out.append(pl)
-            continue
-
-        # Fallback: minimal payload
-        pl = {**base, "chunk_index": idx}
-        if weight is not None:
-            pl["retriever_weight"] = weight
-        out.append(pl)
+    # The enumerate position is only the fallback index; a chunk's own index wins.
+    for i, ch in enumerate(chunks):
+        text, idx = _extract_chunk_text_and_index(ch, i)
+        payload = dict(common)  # copy
+        payload["chunk_index"] = idx
+        payload["text"] = text
+        out.append(payload)

    return out