Dateien nach "app/core" hochladen
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s

This commit is contained in:
Lars 2025-11-08 21:40:13 +01:00
parent 290b271cf6
commit c93e7ad598
2 changed files with 309 additions and 236 deletions

View File

@ -1,149 +1,174 @@
"""
chunk_payload.py
Version: 1.4.2
Description:
Builds the payloads for *chunks* of a note destined for the Qdrant "chunks" collection.
- Defensive against both dict-like and attribute-like chunk objects.
- Accepts extra/legacy arguments via *args / **kwargs (e.g., vault_root, type_defaults).
- Ensures "retriever_weight" is present in every chunk payload, derived from the note.
- Preserves common chunk metadata (idx, offsets, tokens, section info, etc.).
- Tolerates legacy third positional parameter.
Public API:
make_chunk_payloads(parsed_note, chunks, *_, retriever_weight=None, vault_root=None, type_defaults=None, **__)
"""
chunk_payload.py Mindnet core payload builder (v0.5, 2025-11-08)
Purpose
-------
Builds a list of **JSON-serializable** payload dicts for chunks of a note to be
stored in `<prefix>_chunks`. Ensures `retriever_weight` is set on every chunk.
Public API
----------
make_chunk_payloads(parsed_note, chunks, *args, retriever_weight=None, vault_root=None, type_defaults=None, **kwargs) -> list[dict]
"""
from typing import Any, Dict, Iterable, List, Optional
from __future__ import annotations
from pathlib import Path
from typing import Any, Dict, List, Optional, Union, Mapping
import datetime, math
def _as_dict(obj: Any) -> Dict[str, Any]:
if isinstance(obj, dict):
return obj
out = {}
for key in (
"frontmatter", "fm", "meta",
"note_id", "id",
"title", "type", "tags", "aliases",
"created", "updated", "date",
"abs_path", "path", "rel_path",
):
if hasattr(obj, key):
out[key] = getattr(obj, key)
return out
Json = Union[None, bool, int, float, str, list, dict]
def _get(obj: Any, *keys: str, default: Any=None):
if isinstance(obj, dict):
for k in keys:
if k in obj:
return obj[k]
return default
for k in keys:
if hasattr(obj, k):
return getattr(obj, k)
# ------------------------- helpers -------------------------
def _is_mapping(x: Any) -> bool:
return isinstance(x, Mapping)
def _get(obj: Any, *names: str, default: Any=None) -> Any:
for n in names:
if hasattr(obj, n):
try:
return getattr(obj, n)
except Exception:
pass
if _is_mapping(obj) and n in obj:
try:
return obj[n]
except Exception:
pass
return default
def _as_list(val):
if val is None:
def _to_float(x: Any, default: float=1.0) -> float:
if x is None:
return float(default)
if isinstance(x, (int, float)) and math.isfinite(x):
return float(x)
try:
s = str(x).strip().replace(',', '.')
return float(s)
except Exception:
return float(default)
def _ensure_list(x: Any) -> list:
if x is None:
return []
if isinstance(x, list):
return x
if isinstance(x, (set, tuple)):
return list(x)
return [x]
def _sanitize(obj: Any) -> Json:
if obj is None or isinstance(obj, (bool, int, float, str)):
return obj
if callable(obj):
return None
if isinstance(val, (list, tuple)):
return list(val)
if isinstance(val, str):
return [val]
try:
return list(val)
except Exception:
return [val]
def _coerce_float(val: Any, default: float) -> float:
if val is None:
return float(default)
try:
return float(val)
except Exception:
return float(default)
def _clean(d: Dict[str, Any]) -> Dict[str, Any]:
return {k: v for k, v in d.items() if v is not None}
def make_chunk_payloads(
parsed_note: Any,
chunks: Iterable[Any],
*_, # legacy extra positional parameters tolerated
retriever_weight: Optional[float] = None,
vault_root: Optional[str] = None,
type_defaults: Optional[Dict[str, Dict[str, Any]]] = None,
**__, # ignore unexpected kwargs
) -> List[Dict[str, Any]]:
nd = _as_dict(parsed_note)
fm = _get(nd, "frontmatter", "fm", "meta", default={}) or {}
note_id = _get(nd, "note_id", "id") or fm.get("id")
title = _get(nd, "title") or fm.get("title")
ntype = _get(nd, "type") or fm.get("type") or "concept"
# Effective path for source reference
abs_path = _get(nd, "abs_path", "path")
rel_path = _get(nd, "rel_path")
if vault_root and abs_path and not rel_path:
if isinstance(obj, (list, tuple, set)):
return [_sanitize(v) for v in obj]
if isinstance(obj, dict):
out = {}
for k, v in obj.items():
if callable(v):
continue
out[str(k)] = _sanitize(v)
return out
if isinstance(obj, Path):
return str(obj)
if isinstance(obj, datetime.datetime):
return obj.isoformat()
if hasattr(obj, "__str__"):
try:
from pathlib import Path
rel_path = str(Path(abs_path).resolve().relative_to(Path(vault_root).resolve()))
return str(obj)
except Exception:
rel_path = _get(nd, "path") or abs_path
return None
return None
# Effective chunk_profile
chunk_profile = fm.get("chunk_profile")
if not chunk_profile and type_defaults and ntype in type_defaults:
chunk_profile = type_defaults[ntype].get("chunk_profile")
def _compute_retriever_weight(explicit: Any, frontmatter: dict, type_defaults: Optional[dict], note_type: Optional[str]) -> float:
    """Resolve the retriever weight of a note as a float.

    Precedence: *explicit* > *frontmatter* > ``type_defaults[note_type]`` > 1.0.

    Each source is searched for the flat keys ``retriever_weight``,
    ``retriever.weight`` and ``retrieverWeight``, and then for the nested
    form ``retriever: {weight: ...}``. Entries whose value is ``None`` count
    as absent, so an empty frontmatter field does not mask the type default.
    Sources that are not mappings are skipped instead of raising.
    """
    if explicit is not None:
        return _to_float(explicit, 1.0)

    def _lookup(source: Any) -> Any:
        # Return the raw weight stored in one source, or None if it has none.
        if not isinstance(source, Mapping):
            return None
        for key in ("retriever_weight", "retriever.weight", "retrieverWeight"):
            if source.get(key) is not None:
                return source[key]
        nested = source.get("retriever")
        if isinstance(nested, Mapping):
            return nested.get("weight")
        return None

    sources = [frontmatter]
    if type_defaults and note_type:
        sources.append(type_defaults.get(note_type))
    for source in sources:
        raw = _lookup(source)
        if raw is not None:
            return _to_float(raw, 1.0)
    return 1.0
# Resolve retriever_weight once at note level, apply to all chunks
if retriever_weight is None:
retriever_weight = (
fm.get("retriever_weight")
or (fm.get("retriever", {}) or {}).get("weight")
)
if retriever_weight is None and type_defaults and ntype in type_defaults:
retriever_weight = type_defaults[ntype].get("retriever_weight")
# ------------------------- public API -------------------------
retriever_weight = _coerce_float(retriever_weight, default=1.0)
def make_chunk_payloads(parsed_note: Any,
chunks: List[Any],
*args,
retriever_weight: Optional[float]=None,
vault_root: Optional[str]=None,
type_defaults: Optional[dict]=None,
**kwargs) -> List[Dict[str, Json]]:
"""
Build JSON-safe payloads for all chunks in a note.
out: List[Dict[str, Any]] = []
Parameters
----------
parsed_note : object or dict
chunks : list of objects or dicts
Expected per-chunk fields/keys (best-effort): text, index, start/end offsets,
tokens/n_tokens, section/heading.
retriever_weight : float|None
vault_root : str|None
type_defaults : dict|None
Returns
-------
list[dict] suitable for Qdrant payloads
"""
fm = _get(parsed_note, "frontmatter", "fm", default={})
if not isinstance(fm, dict):
fm = {}
note_id = _get(parsed_note, "note_id", "id", default=fm.get("id"))
title = _get(parsed_note, "title", default=fm.get("title"))
ntype = _get(parsed_note, "type", default=fm.get("type"))
raw_path = _get(parsed_note, "path", "rel_path", "relpath", default=fm.get("path"))
chunk_profile = _get(parsed_note, "chunk_profile", "profile", default=fm.get("chunk_profile"))
tags = _ensure_list(_get(parsed_note, "tags", default=fm.get("tags")))
rel_path = raw_path
if raw_path and vault_root:
try:
rel_path = str(Path(raw_path)).replace(str(Path(vault_root)), "").lstrip("/\\")
except Exception:
rel_path = str(raw_path)
rw = _compute_retriever_weight(retriever_weight, fm, type_defaults, ntype)
out: List[Dict[str, Json]] = []
for i, ch in enumerate(chunks):
cd = ch if isinstance(ch, dict) else {}
# Basic fields with many aliases
chunk_id = _get(ch, "chunk_id", "id", default=None)
idx = _get(ch, "idx", "index", default=i)
text = _get(ch, "text", "content", "body", "chunk_text", default=None)
char_start = _get(ch, "char_start", "start", "begin", default=None)
char_end = _get(ch, "char_end", "end", "stop", default=None)
token_count = _get(ch, "token_count", "tokens", "n_tokens", default=None)
section = _get(ch, "section", "heading", default=None)
section_path = _get(ch, "section_path", "hpath", default=None)
# tolerate missing/variant fields
text = _get(ch, "text", "content", "body", "value", default="")
idx = _get(ch, "index", "idx", default=i)
start = _get(ch, "start", "start_char", "offset_start", "char_start", default=None)
end = _get(ch, "end", "end_char", "offset_end", "char_end", default=None)
tokens = _get(ch, "n_tokens", "tokens", "token_count", default=None)
section = _get(ch, "section", "section_title", "heading", default=None)
section_level = _get(ch, "section_level", "heading_level", default=None)
payload = _clean({
payload = {
"note_id": note_id,
"title": title,
"type": ntype,
"path": rel_path or abs_path,
"chunk_profile": chunk_profile,
"retriever_weight": retriever_weight,
"chunk_id": chunk_id or (f"{note_id}#ch{idx}" if note_id is not None else None),
"chunk_index": idx,
"path": rel_path or raw_path,
"chunk_index": int(idx) if isinstance(idx, (int, float)) else i,
"text": text,
"char_start": char_start,
"char_end": char_end,
"token_count": token_count,
"start": start,
"end": end,
"tokens": tokens,
"section": section,
"section_path": section_path,
})
# If the chunk object carries an existing mapping of extra metadata, preserve it.
if isinstance(ch, dict):
# Avoid overwriting the fields we already normalized
extras = {k: v for k, v in ch.items() if k not in payload and v is not None}
if extras:
payload.update(extras)
out.append(payload)
"section_level": section_level,
"chunk_profile": chunk_profile,
"tags": tags,
"retriever_weight": float(rw),
}
out.append(_sanitize(payload))
return out

View File

@ -1,133 +1,181 @@
"""
note_payload.py
Version: 1.4.2
Description:
Builds the payload for a *note* document destined for the Qdrant "notes" collection.
- Defensive against both dict-like and attribute-like "ParsedNote" inputs.
- Accepts extra/legacy arguments via *args / **kwargs (e.g., vault_root, type_defaults).
- Ensures "retriever_weight" is always present in the payload (float), resolved as:
kwarg retriever_weight > frontmatter.retriever_weight > frontmatter.retriever.weight >
type_defaults[type].retriever_weight > 1.0
- Preserves common metadata fields expected downstream.
Public API:
make_note_payload(parsed_note, *_, retriever_weight=None, vault_root=None, type_defaults=None, **__)
"""
note_payload.py Mindnet core payload builder (v0.5, 2025-11-08)
Purpose
-------
Builds a **JSON-serializable** payload dict for a single note to be stored in
the `<prefix>_notes` collection. The function is defensive against both
attribute- and dict-like ParsedNote inputs, unknown kwargs, and ensures
`retriever_weight` is always present as a float.
Key guarantees
--------------
- Accepts extra positional/keyword args without error (for importer compatibility).
- Tolerant of attribute vs dict access for ParsedNote.
- Always sets 'retriever_weight' in the payload (float).
- Never includes non-serializable objects (functions, PosixPath, datetime, etc.).
Public API
----------
make_note_payload(parsed_note, *args, retriever_weight=None, vault_root=None, type_defaults=None, **kwargs) -> dict
"""
from typing import Any, Dict, Optional
from __future__ import annotations
from pathlib import Path
from typing import Any, Dict, Optional, Union, Iterable, Mapping
import datetime, math
def _as_dict(obj: Any) -> Dict[str, Any]:
if isinstance(obj, dict):
return obj
# Try common attribute names to build a dict view
out = {}
for key in (
"frontmatter", "fm", "meta",
"note_id", "id",
"title", "type", "tags", "aliases",
"created", "updated", "date",
"abs_path", "path", "rel_path",
):
if hasattr(obj, key):
out[key] = getattr(obj, key)
return out
Json = Union[None, bool, int, float, str, list, dict]
def _get(obj: Any, *keys: str, default: Any=None):
"""Get first existing key/attribute from obj."""
if isinstance(obj, dict):
for k in keys:
if k in obj:
return obj[k]
return default
# attribute access
for k in keys:
if hasattr(obj, k):
return getattr(obj, k)
# ------------------------- helpers -------------------------
def _is_mapping(x: Any) -> bool:
return isinstance(x, Mapping)
def _get(obj: Any, *names: str, default: Any=None) -> Any:
"""Try attribute lookup, then mapping (dict) lookup, first hit wins."""
for n in names:
if hasattr(obj, n):
try:
return getattr(obj, n)
except Exception:
pass
if _is_mapping(obj) and n in obj:
try:
return obj[n]
except Exception:
pass
return default
def _as_list(val):
if val is None:
def _to_float(x: Any, default: float=1.0) -> float:
if x is None:
return float(default)
if isinstance(x, (int, float)) and math.isfinite(x):
return float(x)
try:
s = str(x).strip().replace(',', '.')
return float(s)
except Exception:
return float(default)
def _ensure_list(x: Any) -> list:
if x is None:
return []
if isinstance(x, list):
return x
if isinstance(x, (set, tuple)):
return list(x)
return [x]
def _sanitize(obj: Any) -> Json:
"""Recursively convert to JSON-serializable primitives; drop callables."""
if obj is None or isinstance(obj, (bool, int, float, str)):
return obj
if callable(obj):
return None
if isinstance(val, (list, tuple)):
return list(val)
if isinstance(val, str):
return [val]
try:
return list(val) # best-effort
except Exception:
return [val]
def _coerce_float(val: Any, default: float) -> float:
if val is None:
return float(default)
try:
return float(val)
except Exception:
return float(default)
def _clean(d: Dict[str, Any]) -> Dict[str, Any]:
return {k: v for k, v in d.items() if v is not None}
def make_note_payload(
parsed_note: Any,
*_, # ignore legacy extra positional args for backward compatibility
retriever_weight: Optional[float] = None,
vault_root: Optional[str] = None,
type_defaults: Optional[Dict[str, Dict[str, Any]]] = None,
**__, # ignore any unexpected kwargs
) -> Dict[str, Any]:
nd = _as_dict(parsed_note)
fm = _get(nd, "frontmatter", "fm", "meta", default={}) or {}
note_id = _get(nd, "note_id", "id") or fm.get("id")
title = _get(nd, "title") or fm.get("title")
ntype = _get(nd, "type") or fm.get("type") or "concept"
# Path handling
abs_path = _get(nd, "abs_path", "path")
rel_path = _get(nd, "rel_path")
if vault_root and abs_path and not rel_path:
if isinstance(obj, (list, tuple, set)):
return [_sanitize(v) for v in obj]
if isinstance(obj, dict):
out = {}
for k, v in obj.items():
if callable(v):
continue
out[str(k)] = _sanitize(v)
return out
if isinstance(obj, Path):
return str(obj)
if isinstance(obj, datetime.datetime):
return obj.isoformat()
if hasattr(obj, "__str__"):
try:
from pathlib import Path
rel_path = str(Path(abs_path).resolve().relative_to(Path(vault_root).resolve()))
return str(obj)
except Exception:
rel_path = _get(nd, "path") or abs_path
return None
return None
# Tags / aliases
tags = _as_list(_get(nd, "tags") or fm.get("tags"))
aliases = _as_list(_get(nd, "aliases") or fm.get("aliases"))
def _compute_retriever_weight(explicit: Any, frontmatter: dict, type_defaults: Optional[dict], note_type: Optional[str]) -> float:
    """Resolve the retriever weight of a note as a float.

    Precedence: *explicit* > *frontmatter* > ``type_defaults[note_type]`` > 1.0.

    Each source is searched for the flat keys ``retriever_weight``,
    ``retriever.weight`` and ``retrieverWeight``, and then for the nested
    form ``retriever: {weight: ...}``. Entries whose value is ``None`` count
    as absent, so an empty frontmatter field does not mask the type default.
    Sources that are not mappings are skipped instead of raising.
    """
    if explicit is not None:
        return _to_float(explicit, 1.0)

    def _lookup(source: Any) -> Any:
        # Return the raw weight stored in one source, or None if it has none.
        if not isinstance(source, Mapping):
            return None
        # common flat keys
        for key in ("retriever_weight", "retriever.weight", "retrieverWeight"):
            if source.get(key) is not None:
                return source[key]
        # nested form, e.g. frontmatter "retriever: {weight: 0.9}"
        nested = source.get("retriever")
        if isinstance(nested, Mapping):
            return nested.get("weight")
        return None

    sources = [frontmatter]
    # type defaults map like: {"concept": {"retriever_weight": 0.9}, ...}
    if type_defaults and note_type:
        sources.append(type_defaults.get(note_type))
    for source in sources:
        raw = _lookup(source)
        if raw is not None:
            return _to_float(raw, 1.0)
    return 1.0
# Created/Updated
created = _get(nd, "created", "date") or fm.get("created") or fm.get("date")
updated = _get(nd, "updated") or fm.get("updated")
# ------------------------- public API -------------------------
# Chunk profile (effective)
chunk_profile = fm.get("chunk_profile")
if not chunk_profile and type_defaults and ntype in type_defaults:
chunk_profile = type_defaults[ntype].get("chunk_profile")
def make_note_payload(parsed_note: Any,
*args,
retriever_weight: Optional[float]=None,
vault_root: Optional[str]=None,
type_defaults: Optional[dict]=None,
**kwargs) -> Dict[str, Json]:
"""
Build a JSON-safe payload for the note.
# Retriever weight resolution (ensures it is present)
if retriever_weight is None:
retriever_weight = (
fm.get("retriever_weight")
or (fm.get("retriever", {}) or {}).get("weight")
)
if retriever_weight is None and type_defaults and ntype in type_defaults:
retriever_weight = type_defaults[ntype].get("retriever_weight")
Parameters (tolerant; unknown args are ignored)
----------
parsed_note : object or dict
Expected fields/keys (best-effort): note_id/id, title, type, path/rel_path,
frontmatter, tags, aliases, chunk_profile.
retriever_weight : float|None
Overrides frontmatter/type-defaults if provided.
vault_root : str|None
Optional; used to produce a normalized relative path.
type_defaults : dict|None
Optional map for per-type defaults.
retriever_weight = _coerce_float(retriever_weight, default=1.0)
Returns
-------
dict suitable for Qdrant payload
"""
fm = _get(parsed_note, "frontmatter", "fm", default={})
if not isinstance(fm, dict):
fm = {}
payload = _clean({
note_id = _get(parsed_note, "note_id", "id", default=fm.get("id"))
title = _get(parsed_note, "title", default=fm.get("title"))
ntype = _get(parsed_note, "type", default=fm.get("type"))
raw_path = _get(parsed_note, "path", "rel_path", "relpath", default=fm.get("path"))
tags = _ensure_list(_get(parsed_note, "tags", default=fm.get("tags")))
aliases = _ensure_list(_get(parsed_note, "aliases", default=fm.get("aliases")))
chunk_profile = _get(parsed_note, "chunk_profile", "profile", default=fm.get("chunk_profile"))
created = _get(parsed_note, "created", default=fm.get("created"))
updated = _get(parsed_note, "updated", default=fm.get("updated"))
# normalize path relative to vault root if both available
rel_path = raw_path
if raw_path and vault_root:
try:
rel_path = str(Path(raw_path)).replace(str(Path(vault_root)), "").lstrip("/\\")
except Exception:
rel_path = str(raw_path)
rw = _compute_retriever_weight(retriever_weight, fm, type_defaults, ntype)
payload = {
"note_id": note_id,
"id": note_id, # keep both, many downstream tools expect 'id'
"title": title,
"type": ntype,
"path": rel_path or raw_path,
"tags": tags,
"aliases": aliases,
"chunk_profile": chunk_profile,
"created": created,
"updated": updated,
"path": rel_path or abs_path,
"chunk_profile": chunk_profile,
"retriever_weight": retriever_weight,
})
return payload
"retriever_weight": float(rw),
}
# Add selected FM fields if present (safe subset)
for key in ("status", "priority", "owner", "source"):
if key in fm:
payload[key] = fm.get(key)
return _sanitize(payload)