From bbc8f13944c053907305e7830f8b7a0109094249 Mon Sep 17 00:00:00 2001 From: Lars Date: Sun, 16 Nov 2025 18:56:33 +0100 Subject: [PATCH] Dateien nach "app/core" hochladen --- app/core/chunk_payload.py | 59 ++++++++++++--------------------------- 1 file changed, 18 insertions(+), 41 deletions(-) diff --git a/app/core/chunk_payload.py b/app/core/chunk_payload.py index 7ce39fe..5c23165 100644 --- a/app/core/chunk_payload.py +++ b/app/core/chunk_payload.py @@ -1,33 +1,12 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ -app/core/chunk_payload.py (Mindnet V2 — robust) - -Aufgabe -------- -Erzeugt Chunk-Payloads aus den vom Chunker gelieferten "Chunk"-Objekten. -- Spiegelt `retriever_weight` und `chunk_profile` in **jedem** Chunk-Payload. -- Werteauflösung: Frontmatter > types.yaml > Defaults. -- Lädt `config/types.yaml` selbst, wenn `types_cfg` nicht übergeben wurde. - -Eingang -------- -- note: Dict mit mind. { frontmatter: {...}, id, type, title, path } -- note_path: Pfad der Note (für Payload-Feld `path`) -- chunks_from_chunker: Liste von Objekten mit Attributen/Feldern: - id, note_id, index, text, window, neighbors_prev, neighbors_next -- note_text: voller Text der Note (optional, kann leer sein) -- types_cfg: optional; wenn None → config wird intern geladen -- file_path: optional, für Debug/Tracing im Payload - -Ausgang (pro Chunk) -------------------- -- Pflichtfelder: note_id, chunk_id, index (0-basiert), ord (1-basiert), type, tags -- Texte: text, window -- Nachbarn: neighbors_prev, neighbors_next -- Spiegelungen: retriever_weight, chunk_profile -- Meta: source_path, path, section (leer), created/updated opt. aus Frontmatter +app/core/chunk_payload.py (Mindnet V2 — robust v2) +Änderungen ggü. v1: +- neighbors_prev / neighbors_next werden als **Array** persistiert ([], [id]). +- retriever_weight / chunk_profile werden je Chunk aufgelöst (Frontmatter > types.yaml > Defaults). +- Lädt config/types.yaml selbst, wenn types_cfg nicht übergeben wurde. """ from __future__ import annotations from typing import Any, Dict, List, Optional @@ -60,10 +39,8 @@ def _load_types_local() -> dict: return {} def _effective_chunk_profile(note_type: str, fm: Dict[str, Any], reg: dict) -> Optional[str]: - # Frontmatter zuerst if isinstance(fm.get("chunk_profile"), str): return fm.get("chunk_profile") - # Registry types = reg.get("types") if isinstance(reg.get("types"), dict) else reg if isinstance(types, dict): v = types.get(note_type, {}) @@ -74,12 +51,10 @@ def _effective_chunk_profile(note_type: str, fm: Dict[str, Any], reg: dict) -> O return None def _effective_retriever_weight(note_type: str, fm: Dict[str, Any], reg: dict) -> float: - # Frontmatter zuerst if fm.get("retriever_weight") is not None: v = _as_float(fm.get("retriever_weight")) if v is not None: return float(v) - # Registry-Pfade types = reg.get("types") if isinstance(reg.get("types"), dict) else reg candidates = [ f"{note_type}.retriever_weight", @@ -91,16 +66,21 @@ def _effective_retriever_weight(note_type: str, fm: Dict[str, Any], reg: dict) - "global.retriever.weight", ] for path in candidates: - # Wenn types == reg-root (flatten), erlauben sowohl "types.X" als auch "X" val = _deep_get(types, path) if "." in path else (types.get(path) if isinstance(types, dict) else None) if val is None and isinstance(reg, dict): - # versuche absolute Pfade val = _deep_get(reg, f"types.{path}") v = _as_float(val) if v is not None: return float(v) return 1.0 +def _as_list(x): + if x is None: + return [] + if isinstance(x, list): + return x + return [x] + def make_chunk_payloads(note: Dict[str, Any], note_path: str, chunks_from_chunker: List[Any], @@ -108,11 +88,10 @@ def make_chunk_payloads(note: Dict[str, Any], note_text: str = "", types_cfg: Optional[dict] = None, file_path: Optional[str] = None) -> List[Dict[str, Any]]: - fm = (note or {}).get("frontmatter", {}) + fm = (note or {}).get("frontmatter", {}) or {} note_type = fm.get("type") or note.get("type") or "concept" reg = types_cfg if isinstance(types_cfg, dict) else _load_types_local() - # Effektive Werte bestimmen cp = _effective_chunk_profile(note_type, fm, reg) rw = _effective_retriever_weight(note_type, fm, reg) @@ -121,9 +100,8 @@ def make_chunk_payloads(note: Dict[str, Any], tags = [tags] out: List[Dict[str, Any]] = [] - for idx, ch in enumerate(chunks_from_chunker): - # Chunk-Grunddaten (Attribute oder Keys) + # Attribute oder Keys (Chunk-Objekt oder Dict) cid = getattr(ch, "id", None) or (ch.get("id") if isinstance(ch, dict) else None) nid = getattr(ch, "note_id", None) or (ch.get("note_id") if isinstance(ch, dict) else fm.get("id")) index = getattr(ch, "index", None) or (ch.get("index") if isinstance(ch, dict) else idx) @@ -141,8 +119,8 @@ def make_chunk_payloads(note: Dict[str, Any], "tags": tags, "text": text, "window": window, - "neighbors_prev": prev_id, - "neighbors_next": next_id, + "neighbors_prev": _as_list(prev_id), + "neighbors_next": _as_list(next_id), "section": getattr(ch, "section", None) or (ch.get("section") if isinstance(ch, dict) else ""), "path": note_path, "source_path": file_path or note_path, @@ -151,10 +129,9 @@ def make_chunk_payloads(note: Dict[str, Any], if cp is not None: pl["chunk_profile"] = cp - # Aufräumen: keine historischen Aliasfelder + # Aufräumen for alias in ("chunk_num", "Chunk_Number"): - if alias in pl: - pl.pop(alias, None) + pl.pop(alias, None) out.append(pl)