From 290b271cf6af06dbcf40527d48c77e853282165c Mon Sep 17 00:00:00 2001 From: Lars Date: Sat, 8 Nov 2025 21:36:53 +0100 Subject: [PATCH] Dateien nach "app/core" hochladen --- app/core/chunk_payload.py | 263 ++++++++++++++++++-------------------- app/core/note_payload.py | 216 +++++++++++++++---------------- 2 files changed, 230 insertions(+), 249 deletions(-) diff --git a/app/core/chunk_payload.py b/app/core/chunk_payload.py index e935ddf..1f28dd4 100644 --- a/app/core/chunk_payload.py +++ b/app/core/chunk_payload.py @@ -1,162 +1,149 @@ """ -chunk_payload.py — mindnet core payload builders -Version: 1.3.1 (2025-11-08) +chunk_payload.py +Version: 1.4.2 +Description: + Builds the payloads for *chunks* of a note destined for the Qdrant "chunks" collection. + - Defensive against both dict-like and attribute-like chunk objects. + - Accepts extra/legacy arguments via *args / **kwargs (e.g., vault_root, type_defaults). + - Ensures "retriever_weight" is present in every chunk payload, derived from the note. + - Preserves common chunk metadata (idx, offsets, tokens, section info, etc.). + - Tolerates legacy third positional parameter. -Purpose -------- -Build robust chunk payloads for Qdrant upserts. -This function is intentionally flexible about its signature to remain -compatible with different callers. - -Contract --------- -make_chunk_payloads(note, chunks, *args, **kwargs) -> List[Dict[str, Any]] - -Each returned item contains at least: -- note_id (str) -- title (str) -- type (str) -- path (str or None) -- tags (List[str]) -- chunk_index (int) -- text (str) -- retriever_weight (float or None) # if available +Public API: + make_chunk_payloads(parsed_note, chunks, *_, retriever_weight=None, vault_root=None, type_defaults=None, **__) """ -from __future__ import annotations -from pathlib import Path -from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Union +from typing import Any, Dict, Iterable, List, Optional +def _as_dict(obj: Any) -> Dict[str, Any]: + if isinstance(obj, dict): + return obj + out = {} + for key in ( + "frontmatter", "fm", "meta", + "note_id", "id", + "title", "type", "tags", "aliases", + "created", "updated", "date", + "abs_path", "path", "rel_path", + ): + if hasattr(obj, key): + out[key] = getattr(obj, key) + return out -def _get(obj: Any, key: str, default: Any = None) -> Any: - if obj is None: +def _get(obj: Any, *keys: str, default: Any=None): + if isinstance(obj, dict): + for k in keys: + if k in obj: + return obj[k] return default - if isinstance(obj, Mapping): - return obj.get(key, default) - return getattr(obj, key, default) + for k in keys: + if hasattr(obj, k): + return getattr(obj, k) + return default - -def _get_frontmatter(note: Any) -> Mapping[str, Any]: - fm = _get(note, "frontmatter", {}) - if isinstance(fm, Mapping): - return fm - return {} - - -def _resolve_retriever_weight(explicit: Any, fm: Mapping[str, Any]) -> Optional[float]: - def to_float(v: Any) -> Optional[float]: - try: - if v is None: - return None - return float(v) - except Exception: - return None - - if explicit is not None: - return to_float(explicit) - - if "retriever_weight" in fm: - return to_float(fm.get("retriever_weight")) - - retr = fm.get("retriever") - if isinstance(retr, Mapping) and "weight" in retr: - return to_float(retr.get("weight")) - - return None - - -def _to_rel_path(abs_path: Optional[Union[str, Path]], vault_root: Optional[Union[str, Path]]) -> Optional[str]: - if abs_path is None: +def _as_list(val): + if val is None: return None + if isinstance(val, (list, tuple)): + return list(val) + if isinstance(val, str): + return [val] try: - p = Path(abs_path) - if vault_root: - try: - rp = p.relative_to(Path(vault_root)) - return str(rp) - except Exception: - return str(p) - return str(p) + return list(val) except Exception: - return str(abs_path) + return [val] - -def _coerce_chunks(chunks_obj: Any) -> List[Any]: - """Accept lists of dicts/objects or generators; coerce to list safely.""" - if chunks_obj is None: - return [] - if isinstance(chunks_obj, list): - return chunks_obj +def _coerce_float(val: Any, default: float) -> float: + if val is None: + return float(default) try: - return list(chunks_obj) + return float(val) except Exception: - return [] - - -def _get_chunk_text(c: Any) -> str: - for key in ("text", "chunk", "body", "content"): - v = _get(c, key) - if isinstance(v, str) and v.strip(): - return v - # last resort: string repr - return str(c) if c is not None else "" + return float(default) +def _clean(d: Dict[str, Any]) -> Dict[str, Any]: + return {k: v for k, v in d.items() if v is not None} def make_chunk_payloads( - *args: Any, - **kwargs: Any, + parsed_note: Any, + chunks: Iterable[Any], + *_, # legacy extra positional parameters tolerated + retriever_weight: Optional[float] = None, + vault_root: Optional[str] = None, + type_defaults: Optional[Dict[str, Dict[str, Any]]] = None, + **__, # ignore unexpected kwargs ) -> List[Dict[str, Any]]: - """ - Flexible signature for backward/forward compatibility. - Expected positional args: - args[0] -> note (ParsedNote or Mapping) - args[1] -> chunks (Iterable) - args[2] -> (optional) config/ignored - Recognized kwargs: - - vault_root: base path for relative paths (optional) - - retriever_weight: explicit override (optional) - """ - if not args: - raise TypeError("make_chunk_payloads(note, chunks, *_) requires at least (note, chunks).") + nd = _as_dict(parsed_note) + fm = _get(nd, "frontmatter", "fm", "meta", default={}) or {} - note = args[0] - chunks = args[1] if len(args) > 1 else kwargs.get("chunks") - chunks_list = _coerce_chunks(chunks) + note_id = _get(nd, "note_id", "id") or fm.get("id") + title = _get(nd, "title") or fm.get("title") + ntype = _get(nd, "type") or fm.get("type") or "concept" - vault_root = kwargs.get("vault_root") - explicit_weight = kwargs.get("retriever_weight") + # Effective path for source reference + abs_path = _get(nd, "abs_path", "path") + rel_path = _get(nd, "rel_path") + if vault_root and abs_path and not rel_path: + try: + from pathlib import Path + rel_path = str(Path(abs_path).resolve().relative_to(Path(vault_root).resolve())) + except Exception: + rel_path = _get(nd, "path") or abs_path - fm = _get_frontmatter(note) + # Effective chunk_profile + chunk_profile = fm.get("chunk_profile") + if not chunk_profile and type_defaults and ntype in type_defaults: + chunk_profile = type_defaults[ntype].get("chunk_profile") - note_id = _get(note, "note_id") or _get(note, "id") or fm.get("id") - title = _get(note, "title") or fm.get("title") - ntype = _get(note, "type") or fm.get("type") - tags = _get(note, "tags") or fm.get("tags") or [] - if not isinstance(tags, list): - tags = list(tags) if tags else [] - - path_val = _get(note, "path") or _get(note, "abs_path") or fm.get("path") - rweight = _resolve_retriever_weight(explicit_weight, fm) - - base = { - "note_id": note_id, - "title": title, - "type": ntype, - "tags": tags, - "path": _to_rel_path(path_val, vault_root), - "retriever_weight": rweight, - } - - payloads: List[Dict[str, Any]] = [] - for idx, ch in enumerate(chunks_list): - text = _get_chunk_text(ch) - item = dict(base) - item.update( - { - "chunk_index": idx, - "text": text, - } + # Resolve retriever_weight once at note level, apply to all chunks + if retriever_weight is None: + retriever_weight = ( + fm.get("retriever_weight") + or (fm.get("retriever", {}) or {}).get("weight") ) - payloads.append(item) + if retriever_weight is None and type_defaults and ntype in type_defaults: + retriever_weight = type_defaults[ntype].get("retriever_weight") - return payloads + retriever_weight = _coerce_float(retriever_weight, default=1.0) + + out: List[Dict[str, Any]] = [] + for i, ch in enumerate(chunks): + cd = ch if isinstance(ch, dict) else {} + # Basic fields with many aliases + chunk_id = _get(ch, "chunk_id", "id", default=None) + idx = _get(ch, "idx", "index", default=i) + text = _get(ch, "text", "content", "body", "chunk_text", default=None) + char_start = _get(ch, "char_start", "start", "begin", default=None) + char_end = _get(ch, "char_end", "end", "stop", default=None) + token_count = _get(ch, "token_count", "tokens", "n_tokens", default=None) + section = _get(ch, "section", "heading", default=None) + section_path = _get(ch, "section_path", "hpath", default=None) + + payload = _clean({ + "note_id": note_id, + "title": title, + "type": ntype, + "path": rel_path or abs_path, + "chunk_profile": chunk_profile, + "retriever_weight": retriever_weight, + + "chunk_id": chunk_id or (f"{note_id}#ch{idx}" if note_id is not None else None), + "chunk_index": idx, + "text": text, + "char_start": char_start, + "char_end": char_end, + "token_count": token_count, + "section": section, + "section_path": section_path, + }) + + # If the chunk object carries an existing mapping of extra metadata, preserve it. + if isinstance(ch, dict): + # Avoid overwriting the fields we already normalized + extras = {k: v for k, v in ch.items() if k not in payload and v is not None} + if extras: + payload.update(extras) + + out.append(payload) + + return out diff --git a/app/core/note_payload.py b/app/core/note_payload.py index c2dff31..e01139f 100644 --- a/app/core/note_payload.py +++ b/app/core/note_payload.py @@ -1,139 +1,133 @@ """ -note_payload.py — mindnet core payload builders -Version: 1.3.1 (2025-11-08) +note_payload.py +Version: 1.4.2 +Description: + Builds the payload for a *note* document destined for the Qdrant "notes" collection. + - Defensive against both dict-like and attribute-like "ParsedNote" inputs. + - Accepts extra/legacy arguments via *args / **kwargs (e.g., vault_root, type_defaults). + - Ensures "retriever_weight" is always present in the payload (float), resolved as: + kwarg retriever_weight > frontmatter.retriever_weight > frontmatter.retriever.weight > + type_defaults[type].retriever_weight > 1.0 + - Preserves common metadata fields expected downstream. -Purpose -------- -Build a robust, forward-compatible note payload for Qdrant upserts. -This module is intentionally defensive: -- Accepts both dict-like "parsed note" objects and dataclass/objects with attributes. -- Tolerates extra kwargs from different callers (e.g., `vault_root`, `prefix`, etc.). -- Ensures `retriever_weight` is resolved and present in the payload if available. - -Contract --------- -make_note_payload(note, **kwargs) -> Dict[str, Any] - -Expected minimal fields in returned payload: -- note_id (str) -- title (str) -- type (str) -- path (str or None) # relative to vault_root when provided -- tags (List[str]) -- retriever_weight (float or None) # if available +Public API: + make_note_payload(parsed_note, *_, retriever_weight=None, vault_root=None, type_defaults=None, **__) """ -from __future__ import annotations -from pathlib import Path -from typing import Any, Dict, Iterable, List, Mapping, Optional, Union +from typing import Any, Dict, Optional +def _as_dict(obj: Any) -> Dict[str, Any]: + if isinstance(obj, dict): + return obj + # Try common attribute names to build a dict view + out = {} + for key in ( + "frontmatter", "fm", "meta", + "note_id", "id", + "title", "type", "tags", "aliases", + "created", "updated", "date", + "abs_path", "path", "rel_path", + ): + if hasattr(obj, key): + out[key] = getattr(obj, key) + return out -def _get(obj: Any, key: str, default: Any = None) -> Any: - """Try to read `key` from mapping or attribute; else default.""" - if obj is None: +def _get(obj: Any, *keys: str, default: Any=None): + """Get first existing key/attribute from obj.""" + if isinstance(obj, dict): + for k in keys: + if k in obj: + return obj[k] return default - if isinstance(obj, Mapping): - return obj.get(key, default) # attribute access - return getattr(obj, key, default) + for k in keys: + if hasattr(obj, k): + return getattr(obj, k) + return default - -def _get_frontmatter(note: Any) -> Mapping[str, Any]: - fm = _get(note, "frontmatter", {}) - if isinstance(fm, Mapping): - return fm - return {} # be safe - - -def _resolve_retriever_weight(explicit: Any, fm: Mapping[str, Any]) -> Optional[float]: - """ - Priority: - 1) explicit kwarg retriever_weight - 2) frontmatter['retriever_weight'] - 3) frontmatter['retriever']['weight'] - """ - def to_float(v: Any) -> Optional[float]: - try: - if v is None: - return None - return float(v) - except Exception: - return None - - if explicit is not None: - return to_float(explicit) - - if "retriever_weight" in fm: - return to_float(fm.get("retriever_weight")) - - retr = fm.get("retriever") - if isinstance(retr, Mapping) and "weight" in retr: - return to_float(retr.get("weight")) - - return None - - -def _to_rel_path(abs_path: Optional[Union[str, Path]], vault_root: Optional[Union[str, Path]]) -> Optional[str]: - if abs_path is None: +def _as_list(val): + if val is None: return None + if isinstance(val, (list, tuple)): + return list(val) + if isinstance(val, str): + return [val] try: - p = Path(abs_path) - if vault_root: - try: - rp = p.relative_to(Path(vault_root)) - return str(rp) - except Exception: - return str(p) - return str(p) + return list(val) # best-effort except Exception: - return str(abs_path) + return [val] +def _coerce_float(val: Any, default: float) -> float: + if val is None: + return float(default) + try: + return float(val) + except Exception: + return float(default) + +def _clean(d: Dict[str, Any]) -> Dict[str, Any]: + return {k: v for k, v in d.items() if v is not None} def make_note_payload( - note: Any, - *args, # tolerate older/other callers - **kwargs: Any, + parsed_note: Any, + *_, # ignore legacy extra positional args for backward compatibility + retriever_weight: Optional[float] = None, + vault_root: Optional[str] = None, + type_defaults: Optional[Dict[str, Dict[str, Any]]] = None, + **__, # ignore any unexpected kwargs ) -> Dict[str, Any]: - """ - Build a normalized note payload for Qdrant. - Unknown kwargs are ignored to keep the function forward-compatible. + nd = _as_dict(parsed_note) + fm = _get(nd, "frontmatter", "fm", "meta", default={}) or {} - Recognized kwargs: - - vault_root: base path to make `path` relative (optional) - - retriever_weight: explicit override (optional) - """ - vault_root = kwargs.get("vault_root") - explicit_weight = kwargs.get("retriever_weight") + note_id = _get(nd, "note_id", "id") or fm.get("id") + title = _get(nd, "title") or fm.get("title") + ntype = _get(nd, "type") or fm.get("type") or "concept" - fm = _get_frontmatter(note) + # Path handling + abs_path = _get(nd, "abs_path", "path") + rel_path = _get(nd, "rel_path") + if vault_root and abs_path and not rel_path: + try: + from pathlib import Path + rel_path = str(Path(abs_path).resolve().relative_to(Path(vault_root).resolve())) + except Exception: + rel_path = _get(nd, "path") or abs_path - note_id = _get(note, "note_id") or _get(note, "id") - if not note_id: - # Try from frontmatter - note_id = fm.get("id") + # Tags / aliases + tags = _as_list(_get(nd, "tags") or fm.get("tags")) + aliases = _as_list(_get(nd, "aliases") or fm.get("aliases")) - title = _get(note, "title") or fm.get("title") - ntype = _get(note, "type") or fm.get("type") + # Created/Updated + created = _get(nd, "created", "date") or fm.get("created") or fm.get("date") + updated = _get(nd, "updated") or fm.get("updated") - tags = _get(note, "tags") or fm.get("tags") or [] - if not isinstance(tags, list): - tags = list(tags) if tags else [] + # Chunk profile (effective) + chunk_profile = fm.get("chunk_profile") + if not chunk_profile and type_defaults and ntype in type_defaults: + chunk_profile = type_defaults[ntype].get("chunk_profile") - path_val = _get(note, "path") or _get(note, "abs_path") or fm.get("path") + # Retriever weight resolution (ensures it is present) + if retriever_weight is None: + retriever_weight = ( + fm.get("retriever_weight") + or (fm.get("retriever", {}) or {}).get("weight") + ) + if retriever_weight is None and type_defaults and ntype in type_defaults: + retriever_weight = type_defaults[ntype].get("retriever_weight") - payload: Dict[str, Any] = { + retriever_weight = _coerce_float(retriever_weight, default=1.0) + + payload = _clean({ "note_id": note_id, + "id": note_id, # keep both, many downstream tools expect 'id' "title": title, "type": ntype, "tags": tags, - "path": _to_rel_path(path_val, vault_root), - "retriever_weight": _resolve_retriever_weight(explicit_weight, fm), - } - - # Also surface explicit frontmatter fields (non-conflicting) if present - for k in ("status", "created", "updated"): - v = fm.get(k) - if v is not None and k not in payload: - payload[k] = v - + "aliases": aliases, + "created": created, + "updated": updated, + "path": rel_path or abs_path, + "chunk_profile": chunk_profile, + "retriever_weight": retriever_weight, + }) return payload