diff --git a/app/core/chunk_payload.py b/app/core/chunk_payload.py index 703fee6..6020f2c 100644 --- a/app/core/chunk_payload.py +++ b/app/core/chunk_payload.py @@ -1,9 +1,27 @@ # app/core/chunk_payload.py -# Line count: 214 +# Version: 1.2.0 (2025-11-08) +# Purpose: +# Build robust Qdrant payloads for CHUNK points. +# +# Highlights: +# - Works with dict-like chunks and simple objects; supports (text, idx) tuples. +# - Accepts legacy/extra kwargs (e.g., vault_root) without failing. +# - Copies canonical note fields onto each chunk (note_id/title/type/tags/path). +# - Sets 'text' and 'chunk_index' per chunk. +# - Reliably propagates `retriever_weight` onto every chunk if provided in +# frontmatter or explicitly. +# +# Usage: +# payloads = make_chunk_payloads(note, chunks, retriever_weight=None, base_payload=None, vault_root="/path/to/vault") +# +# Changelog: +# 1.2.0 (2025-11-08) Accept legacy kwargs, robust getters, propagate retriever_weight. +# 1.1.0 (2025-11-08) Initial robust rewrite with attribute/dict support. from __future__ import annotations -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from pathlib import Path +from typing import Any, Dict, Iterable, List, Optional, Tuple def _get(obj: Any, key: str, default: Any = None) -> Any: @@ -12,13 +30,13 @@ def _get(obj: Any, key: str, default: Any = None) -> Any: if hasattr(obj, key): try: val = getattr(obj, key) - return val if val is not None else default + return default if val is None else val except Exception: pass if isinstance(obj, dict): if key in obj: val = obj.get(key, default) - return val if val is not None else default + return default if val is None else val return default @@ -37,7 +55,7 @@ def _get_from_frontmatter(fm: Dict[str, Any], key: str, default: Any = None) -> return default if key in fm: val = fm.get(key, default) - return val if val is not None else default + return default if val is None else val return default @@ -52,12 +70,12 @@ def _coerce_tags(val: Any) -> List[str]: return [] -def _resolve_retriever_weight( - fm: Dict[str, Any], - explicit: Optional[float], -) -> Optional[float]: +def _resolve_retriever_weight(fm: Dict[str, Any], explicit: Optional[float]) -> Optional[float]: if explicit is not None: - return explicit + try: + return float(explicit) + except Exception: + return None val = _get_from_frontmatter(fm, "retriever_weight", None) if isinstance(val, (int, float)): return float(val) @@ -69,58 +87,37 @@ def _resolve_retriever_weight( return None -def _resolve_note_fields(note: Any) -> Dict[str, Any]: - fm = _get_frontmatter(note) - - note_id = _get_from_frontmatter(fm, "id", None) - if note_id is None: - note_id = _get(note, "note_id", None) - if note_id is None: - note_id = _get(note, "id", None) - - title = _get_from_frontmatter(fm, "title", None) - if title is None: - title = _get(note, "title", None) - - ntype = _get_from_frontmatter(fm, "type", None) - if ntype is None: - ntype = _get(note, "type", None) - - tags = _get_from_frontmatter(fm, "tags", None) - if tags is None: - tags = _get(note, "tags", None) - tags = _coerce_tags(tags) - +def _resolve_path(note: Any, fm: Dict[str, Any], vault_root: Optional[str]) -> Optional[str]: path = _get_from_frontmatter(fm, "path", None) if path is None: - path = _get(note, "path", None) + path = _get(note, "path", None) or _get(note, "source", None) or _get(note, "filepath", None) if path is None: - path = _get(note, "source", None) - if path is None: - path = _get(note, "filepath", None) - - return { - "note_id": note_id, - "title": title, - "type": ntype, - "tags": tags, - "path": path, - "frontmatter": fm, - } + return None + try: + if vault_root: + vr = Path(vault_root) + rel = Path(path) + try: + return str(rel.relative_to(vr)) + except Exception: + return str(rel) + except Exception: + pass + return str(path) -def _extract_chunk_text_and_index( - chunk: Any, - fallback_index: int, -) -> Tuple[str, int]: - """ - Akzeptiert verschiedene Chunk-Formate: - - str (reiner Text) - - dict mit keys: text | window | body | content - - Objekt mit Attributen: text | window | body | content - - (text, idx) Tuple - """ - # Tuple (text, idx) +def _resolve_note_fields(note: Any, vault_root: Optional[str]) -> Dict[str, Any]: + fm = _get_frontmatter(note) + note_id = _get_from_frontmatter(fm, "id", None) or _get(note, "note_id", None) or _get(note, "id", None) + title = _get_from_frontmatter(fm, "title", None) or _get(note, "title", None) + ntype = _get_from_frontmatter(fm, "type", None) or _get(note, "type", None) + tags = _coerce_tags(_get_from_frontmatter(fm, "tags", None) or _get(note, "tags", None)) + path = _resolve_path(note, fm, vault_root) + return {"note_id": note_id, "title": title, "type": ntype, "tags": tags, "path": path, "frontmatter": fm} + + +def _extract_chunk_text_and_index(chunk: Any, fallback_index: int) -> Tuple[str, int]: + # (text, idx) tuple if isinstance(chunk, tuple) and len(chunk) == 2 and isinstance(chunk[0], str): txt, idx = chunk try: @@ -128,19 +125,12 @@ def _extract_chunk_text_and_index( except Exception: idx_int = fallback_index return txt, idx_int - - # String + # string if isinstance(chunk, str): return chunk, fallback_index - - # Dict + # dict if isinstance(chunk, dict): - txt = ( - chunk.get("text") - or chunk.get("window") - or chunk.get("body") - or chunk.get("content") - ) + txt = chunk.get("text") or chunk.get("window") or chunk.get("body") or chunk.get("content") if isinstance(txt, str): idx = chunk.get("index") try: @@ -148,8 +138,7 @@ def _extract_chunk_text_and_index( except Exception: idx_int = fallback_index return txt, idx_int - - # Objekt mit Attributen + # object with attributes for attr in ("text", "window", "body", "content"): if hasattr(chunk, attr): try: @@ -157,7 +146,6 @@ def _extract_chunk_text_and_index( except Exception: txt = None if isinstance(txt, str): - # Optionale "index"-Quelle idx = None if hasattr(chunk, "index"): try: @@ -169,32 +157,41 @@ def _extract_chunk_text_and_index( except Exception: idx_int = fallback_index return txt, idx_int - - # Wenn nichts passt -> klarer Fehler raise ValueError("Unsupported chunk format: cannot extract text/index") def make_chunk_payloads( note: Any, - chunks: Iterable[Any], + chunks, *, retriever_weight: Optional[float] = None, base_payload: Optional[Dict[str, Any]] = None, + vault_root: Optional[str] = None, + **kwargs, ) -> List[Dict[str, Any]]: + """Build Qdrant payloads for chunks from a parsed note and iterable of chunks. + + Parameters + ---------- + note : Any + Parsed note (dict or object with attributes). + chunks : Iterable[Any] + Chunks; supports str, dicts with 'text'/'window'/'body'/'content', objects with same, or (text, idx) tuples. + retriever_weight : Optional[float] + Optional override; if None, value is read from frontmatter. + base_payload : Optional[Dict[str, Any]] + Extra fields to copy onto each chunk. + vault_root : Optional[str] + Optional base path to compute relative 'path' if possible. + **kwargs : + Ignored extra options to remain compatible with callers. """ - Erzeugt Qdrant-Payloads für Chunk-Punkte. - - Kopiert Note-Metadaten (note_id/title/type/tags/path) - - Schreibt text + chunk_index je Chunk - - Setzt retriever_weight, wenn vorhanden/angegeben - """ - out: List[Dict[str, Any]] = [] - note_fields = _resolve_note_fields(note) + note_fields = _resolve_note_fields(note, vault_root) fm = note_fields["frontmatter"] rw = _resolve_retriever_weight(fm, retriever_weight) - # Basisfelder, die jeder Chunk tragen soll common: Dict[str, Any] = {} - if base_payload: + if isinstance(base_payload, dict): common.update({k: v for k, v in base_payload.items() if v is not None}) if note_fields.get("note_id") is not None: @@ -210,9 +207,10 @@ def make_chunk_payloads( if rw is not None: common["retriever_weight"] = rw + out: List[Dict[str, Any]] = [] for i, ch in enumerate(chunks): text, idx = _extract_chunk_text_and_index(ch, i) - payload = dict(common) # copy + payload = dict(common) payload["chunk_index"] = idx payload["text"] = text out.append(payload) diff --git a/app/core/note_payload.py b/app/core/note_payload.py index 697a10c..be0a390 100644 --- a/app/core/note_payload.py +++ b/app/core/note_payload.py @@ -1,38 +1,46 @@ # app/core/note_payload.py -# Line count: 118 +# Version: 1.2.0 (2025-11-08) +# Purpose: +# Build robust Qdrant payloads for NOTE points. +# +# Highlights: +# - Works with both dict-like inputs and ParsedNote-like objects (attribute access). +# - Accepts legacy/extra kwargs (e.g., vault_root) without failing. +# - Copies canonical fields: id/note_id, title, type, tags, path, text (if present). +# - Reliably propagates `retriever_weight` into the payload if set in frontmatter +# (frontmatter.retriever_weight or frontmatter.retriever.weight) or provided explicitly. +# +# Backward compatibility: +# - Signature accepts **kwargs (e.g., vault_root) because some callers pass it. +# - Both 'id' and 'note_id' are written for compatibility with existing queries. +# +# Usage: +# payload = make_note_payload(parsed_note, retriever_weight=None, vault_root="/path/to/vault") +# +# Changelog: +# 1.2.0 (2025-11-08) Accept legacy kwargs, robust getters, propagate retriever_weight. +# 1.1.0 (2025-11-08) Initial robust rewrite with attribute/dict support. from __future__ import annotations -from typing import Any, Dict, List, Optional, Union - -# Public API: -# make_note_payload(note, *, retriever_weight: Optional[float] = None) -> Dict[str, Any] -# -# Anforderungen: -# - Akzeptiert sowohl ParsedNote-ähnliche Objekte (Attribute) als auch Dicts. -# - Liest Felder bevorzugt aus Frontmatter: -# id/title/type/tags/path (+ retriever_weight) -# - Fällt robust auf Note-Attribute zurück (note_id, title, type, tags, path). -# - Setzt retriever_weight nur, wenn vorhanden/angegeben (keine Defaults). -# - Gibt eine reine Payload (dict) zurück, die in Qdrant geschrieben werden kann. +from pathlib import Path +from typing import Any, Dict, List, Optional def _get(obj: Any, key: str, default: Any = None) -> Any: - """Robuste Getter-Funktion: erst Attribute, dann Dict-Keys.""" + """Robust getter: attribute first, then dict.""" if obj is None: return default - # Attribute if hasattr(obj, key): try: val = getattr(obj, key) - return val if val is not None else default + return default if val is None else val except Exception: pass - # Dict if isinstance(obj, dict): if key in obj: val = obj.get(key, default) - return val if val is not None else default + return default if val is None else val return default @@ -40,11 +48,9 @@ def _get_frontmatter(note: Any) -> Dict[str, Any]: fm = _get(note, "frontmatter", None) if isinstance(fm, dict): return fm - # Manche Parser legen Meta in "meta" ab meta = _get(note, "meta", None) if isinstance(meta, dict) and isinstance(meta.get("frontmatter"), dict): return meta["frontmatter"] - # Fallback: leeres Dict return {} @@ -53,7 +59,7 @@ def _get_from_frontmatter(fm: Dict[str, Any], key: str, default: Any = None) -> return default if key in fm: val = fm.get(key, default) - return val if val is not None else default + return default if val is None else val return default @@ -63,24 +69,23 @@ def _coerce_tags(val: Any) -> List[str]: if isinstance(val, list): return [str(x) for x in val] if isinstance(val, str): - # YAML/Frontmatter kann tags als Komma-getrennte Zeichenkette liefern parts = [t.strip() for t in val.split(",")] return [p for p in parts if p] return [] -def _resolve_retriever_weight( - fm: Dict[str, Any], - explicit: Optional[float], -) -> Optional[float]: - # 1) explizit über Funktionsargument +def _resolve_retriever_weight(fm: Dict[str, Any], explicit: Optional[float]) -> Optional[float]: + # 1) explicit argument wins if explicit is not None: - return explicit - # 2) im Frontmatter direkt + try: + return float(explicit) + except Exception: + return None + # 2) frontmatter.retriever_weight val = _get_from_frontmatter(fm, "retriever_weight", None) if isinstance(val, (int, float)): return float(val) - # 3) verschachtelt: frontmatter.retriever.weight + # 3) frontmatter.retriever.weight retr = fm.get("retriever") if isinstance(retr, dict): v = retr.get("weight") @@ -89,59 +94,75 @@ def _resolve_retriever_weight( return None +def _resolve_path(note: Any, fm: Dict[str, Any], vault_root: Optional[str]) -> Optional[str]: + """Try to determine a stable relative path for diagnostics/traceability.""" + path = _get_from_frontmatter(fm, "path", None) + if path is None: + path = _get(note, "path", None) or _get(note, "source", None) or _get(note, "filepath", None) + if path is None: + return None + try: + if vault_root: + vr = Path(vault_root) + # Avoid Windows drive quirks: use Pure/Path consistently + rel = Path(path) + try: + path_rel = str(rel.relative_to(vr)) + except Exception: + # If 'path' is absolute not under vault_root, just return as-is + path_rel = str(rel) + return path_rel + except Exception: + pass + return str(path) + + def make_note_payload( note: Any, *, retriever_weight: Optional[float] = None, + vault_root: Optional[str] = None, + **kwargs, ) -> Dict[str, Any]: """ - Baut eine Qdrant-Payload für eine Note. - Erwartete Felder (wenn vorhanden): id/note_id, title, type, tags, path, text (optional) - retriever_weight wird gesetzt, wenn vorhanden/angegeben. + Build a Qdrant payload dict for a NOTE. + + Parameters + ---------- + note : Any + Parsed note (dict or object with attributes). + retriever_weight : Optional[float] + Optional override; if None, value is read from frontmatter. + vault_root : Optional[str] + Optional base path to compute relative 'path' if possible. + **kwargs : + Ignored extra options to remain compatible with callers. + + Returns + ------- + Dict[str, Any] + Payload ready for Qdrant upsert. """ fm = _get_frontmatter(note) - # ID priorisieren: frontmatter.id > note.note_id > note.id - note_id = _get_from_frontmatter(fm, "id", None) - if note_id is None: - note_id = _get(note, "note_id", None) - if note_id is None: - note_id = _get(note, "id", None) + # id / note_id + note_id = _get_from_frontmatter(fm, "id", None) or _get(note, "note_id", None) or _get(note, "id", None) + title = _get_from_frontmatter(fm, "title", None) or _get(note, "title", None) + ntype = _get_from_frontmatter(fm, "type", None) or _get(note, "type", None) + tags = _coerce_tags(_get_from_frontmatter(fm, "tags", None) or _get(note, "tags", None)) - title = _get_from_frontmatter(fm, "title", None) - if title is None: - title = _get(note, "title", None) - - ntype = _get_from_frontmatter(fm, "type", None) - if ntype is None: - ntype = _get(note, "type", None) - - tags = _get_from_frontmatter(fm, "tags", None) - if tags is None: - tags = _get(note, "tags", None) - tags = _coerce_tags(tags) - - path = _get_from_frontmatter(fm, "path", None) - if path is None: - path = _get(note, "path", None) - # Einige Parser führen den Pfad als "source" / "filepath" - if path is None: - path = _get(note, "source", None) - if path is None: - path = _get(note, "filepath", None) - - # Optionaler Volltext (nicht immer sinnvoll in notes-collection) + # Optional text for notes collection (only if present; we don't force it) text = _get(note, "text", None) if text is None and isinstance(note, dict): - # Parser variieren; manchmal "body" oder "content" text = note.get("body") or note.get("content") + # Path resolution + path = _resolve_path(note, fm, vault_root) + payload: Dict[str, Any] = {} if note_id is not None: - # Für Abwärtskompatibilität beide Felder schreiben - payload["id"] = note_id - payload["note_id"] = note_id - + payload["id"] = note_id # keep for legacy queries + payload["note_id"] = note_id # canonical if title is not None: payload["title"] = title if ntype is not None: