Dateien nach "app/core" hochladen
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s

This commit is contained in:
Lars 2025-11-08 21:36:53 +01:00
parent a686bdbeaf
commit 290b271cf6
2 changed files with 230 additions and 249 deletions

View File

@ -1,162 +1,149 @@
"""
chunk_payload.py — mindnet core payload builders
Version: 1.4.2 (supersedes 1.3.1 of 2025-11-08)
Description:
Builds the payloads for *chunks* of a note destined for the Qdrant "chunks" collection.
- Defensive against both dict-like and attribute-like chunk objects.
- Accepts extra/legacy arguments via *args / **kwargs (e.g., vault_root, type_defaults).
- Ensures "retriever_weight" is present in every chunk payload, derived from the note.
- Preserves common chunk metadata (idx, offsets, tokens, section info, etc.).
- Tolerates legacy third positional parameter.
Purpose
-------
Build robust chunk payloads for Qdrant upserts.
This function is intentionally flexible about its signature to remain
compatible with different callers.
Contract
--------
make_chunk_payloads(note, chunks, *args, **kwargs) -> List[Dict[str, Any]]
Each returned item contains at least:
- note_id (str)
- title (str)
- type (str)
- path (str or None)
- tags (List[str])
- chunk_index (int)
- text (str)
- retriever_weight (float) # always present; falls back to 1.0 when no weight is configured
Public API:
make_chunk_payloads(parsed_note, chunks, *_, retriever_weight=None, vault_root=None, type_defaults=None, **__)
"""
from __future__ import annotations
from pathlib import Path
from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Union
from typing import Any, Dict, Iterable, List, Optional
def _as_dict(obj: Any) -> Dict[str, Any]:
    """Return a dict view of a parsed note.

    A ``dict`` is handed back unchanged (the same object, not a copy). For
    any other object, each of the known note attributes that exists on it is
    copied into a fresh dict; attributes the object does not have are left
    out, so the result may be empty.
    """
    if isinstance(obj, dict):
        return obj
    out: Dict[str, Any] = {}
    for key in (
        "frontmatter", "fm", "meta",
        "note_id", "id",
        "title", "type", "tags", "aliases",
        "created", "updated", "date",
        "abs_path", "path", "rel_path",
    ):
        if hasattr(obj, key):
            out[key] = getattr(obj, key)
    return out
def _get(obj: Any, *keys: str, default: Any = None) -> Any:
    """Return the value of the first of *keys* that exists on *obj*.

    For a ``dict`` the candidates are looked up as items; for any other
    object they are looked up as attributes. Candidates are tried in the
    order given and the first one that exists wins, even when its value is
    ``None`` or otherwise falsy. *default* is returned when no candidate
    exists.

    ``default`` is keyword-only: a positional third argument is treated as
    another candidate key, not as the fallback value.
    """
    if isinstance(obj, dict):
        for k in keys:
            if k in obj:
                return obj[k]
        return default
    for k in keys:
        if hasattr(obj, k):
            return getattr(obj, k)
    return default
def _get_frontmatter(note: Any) -> Mapping[str, Any]:
    """Return the note's ``frontmatter`` if it is a mapping, else an empty dict.

    The fallback is passed as ``default=`` rather than positionally so the
    call means the same thing whether ``_get`` takes a single key or a
    variable number of candidate keys.
    """
    fm = _get(note, "frontmatter", default={})
    if isinstance(fm, Mapping):
        return fm
    return {}
def _resolve_retriever_weight(explicit: Any, fm: Mapping[str, Any]) -> Optional[float]:
    """Resolve the retriever weight for a note.

    Priority:
      1) ``explicit`` (when not ``None``)
      2) ``fm["retriever_weight"]`` (when the key is present)
      3) ``fm["retriever"]["weight"]`` (when ``retriever`` is a mapping with
         that key)

    The first source that is present decides the result: if its value cannot
    be converted to ``float`` the function returns ``None`` and does not fall
    through to the lower-priority sources. ``None`` is also returned when no
    source is present.
    """
    def to_float(v: Any) -> Optional[float]:
        if v is None:
            return None
        try:
            return float(v)
        except (TypeError, ValueError, OverflowError):
            # Not a number (or too large for a float): treat as "no weight".
            return None

    if explicit is not None:
        return to_float(explicit)
    if "retriever_weight" in fm:
        return to_float(fm.get("retriever_weight"))
    retr = fm.get("retriever")
    if isinstance(retr, Mapping) and "weight" in retr:
        return to_float(retr.get("weight"))
    return None
def _to_rel_path(abs_path: Optional[Union[str, Path]], vault_root: Optional[Union[str, Path]]) -> Optional[str]:
    """Return *abs_path* as a string, relative to *vault_root* when possible.

    ``None`` stays ``None``. When *vault_root* is falsy, or *abs_path* does
    not lie under it, the path is returned as given (normalised through
    ``Path``). A value that ``Path`` cannot accept is returned via ``str()``.
    The filesystem is not consulted.
    """
    if abs_path is None:
        return None
    try:
        p = Path(abs_path)
    except TypeError:
        return str(abs_path)
    if not vault_root:
        return str(p)
    try:
        return str(p.relative_to(Path(vault_root)))
    except (TypeError, ValueError):
        # ValueError: p is not below vault_root; TypeError: unusable root.
        return str(p)


def _as_list(val: Any) -> Optional[list]:
    """Normalise *val* to a list, keeping ``None`` as ``None``.

    Lists and tuples are copied into a new list. A ``str`` or ``bytes`` value
    is treated as one item rather than being split into characters/bytes.
    Any other iterable is materialised with ``list()``; a non-iterable value
    is wrapped in a single-element list.
    """
    if val is None:
        return None
    if isinstance(val, (list, tuple)):
        return list(val)
    if isinstance(val, (str, bytes)):
        return [val]
    try:
        return list(val)
    except TypeError:
        return [val]
def _coerce_chunks(chunks_obj: Any) -> List[Any]:
    """Accept lists of dicts/objects or generators; coerce to list safely.

    A ``list`` is returned as-is (same object). ``None`` and values that are
    not iterable produce an empty list.
    """
    if chunks_obj is None:
        return []
    if isinstance(chunks_obj, list):
        return chunks_obj
    try:
        return list(chunks_obj)
    except TypeError:
        return []


def _coerce_float(val: Any, default: float) -> float:
    """Return ``float(val)``, or ``float(default)`` when that is not possible.

    *default* is used when *val* is ``None`` or cannot be converted.
    """
    if val is None:
        return float(default)
    try:
        return float(val)
    except (TypeError, ValueError, OverflowError):
        return float(default)


def _get_chunk_text(c: Any) -> str:
    """Return the text carried by chunk *c*.

    The keys/attributes ``text``, ``chunk``, ``body`` and ``content`` are
    tried in that order; the first value that is a non-blank string is
    returned unchanged. If none qualifies, the result is ``str(c)``, or an
    empty string when *c* is ``None``.
    """
    for key in ("text", "chunk", "body", "content"):
        v = _get(c, key)
        if isinstance(v, str) and v.strip():
            return v
    # last resort: string repr
    return str(c) if c is not None else ""
def _clean(d: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of *d* without the entries whose value is ``None``.

    Only ``None`` is dropped; other falsy values (``0``, ``""``, ``[]``) stay.
    """
    return {k: v for k, v in d.items() if v is not None}
def make_chunk_payloads(
    parsed_note: Any,
    chunks: Iterable[Any],
    *_,  # legacy extra positional parameters tolerated
    retriever_weight: Optional[float] = None,
    vault_root: Optional[str] = None,
    type_defaults: Optional[Dict[str, Dict[str, Any]]] = None,
    **__,  # ignore unexpected kwargs
) -> List[Dict[str, Any]]:
    """Build one payload dict per chunk of a note.

    Args:
        parsed_note: The note, as a dict or as an object with attributes.
        chunks: The note's chunks (dicts or objects); ``None`` yields no
            payloads.
        retriever_weight: Explicit weight; overrides frontmatter and type
            defaults when given.
        vault_root: When set and the note has an absolute path but no
            ``rel_path``, the path is made relative to this directory.
        type_defaults: Per-note-type fallbacks; the keys ``chunk_profile``
            and ``retriever_weight`` are read.

    Returns:
        A list with one dict per chunk. Note-level fields (``note_id``,
        ``title``, ``type``, ``path``, ``chunk_profile``,
        ``retriever_weight``) are repeated on every chunk; entries whose
        value is ``None`` are omitted. ``retriever_weight`` is always a
        float and is 1.0 when nothing else is configured.

    Weight resolution order: the ``retriever_weight`` argument, then
    frontmatter ``retriever_weight``, then frontmatter ``retriever.weight``,
    then ``type_defaults[type]["retriever_weight"]``, then 1.0. A source
    counts as unset only when it is ``None`` or a blank string, so an
    explicit weight of 0 is honoured.
    """
    from pathlib import Path

    def is_unset(value: Any) -> bool:
        return value is None or (isinstance(value, str) and not value.strip())

    nd = _as_dict(parsed_note)
    fm = _get(nd, "frontmatter", "fm", "meta", default={}) or {}
    if not hasattr(fm, "get"):
        # Frontmatter that is not mapping-like (e.g. a raw string) is ignored.
        fm = {}

    note_id = _get(nd, "note_id", "id") or fm.get("id")
    title = _get(nd, "title") or fm.get("title")
    ntype = _get(nd, "type") or fm.get("type") or "concept"

    # Effective path for source reference
    abs_path = _get(nd, "abs_path", "path")
    rel_path = _get(nd, "rel_path")
    if vault_root and abs_path and not rel_path:
        try:
            rel_path = str(Path(abs_path).resolve().relative_to(Path(vault_root).resolve()))
        except (TypeError, ValueError, OSError, RuntimeError):
            # Not below the vault (or unresolvable): keep the path as given.
            rel_path = _get(nd, "path") or abs_path

    note_defaults = (type_defaults or {}).get(ntype) or {}

    # Effective chunk_profile
    chunk_profile = fm.get("chunk_profile") or note_defaults.get("chunk_profile")

    # Resolve retriever_weight once at note level, apply to all chunks
    if is_unset(retriever_weight):
        retriever_weight = fm.get("retriever_weight")
    if is_unset(retriever_weight):
        nested = fm.get("retriever")
        retriever_weight = nested.get("weight") if hasattr(nested, "get") else None
    if is_unset(retriever_weight):
        retriever_weight = note_defaults.get("retriever_weight")
    retriever_weight = _coerce_float(retriever_weight, default=1.0)

    out: List[Dict[str, Any]] = []
    for i, ch in enumerate(chunks if chunks is not None else ()):
        # Basic fields with many aliases
        chunk_id = _get(ch, "chunk_id", "id", default=None)
        idx = _get(ch, "idx", "index", default=i)
        if idx is None:
            # An explicit None index would otherwise produce "...#chNone".
            idx = i
        text = _get(ch, "text", "content", "body", "chunk_text", default=None)
        char_start = _get(ch, "char_start", "start", "begin", default=None)
        char_end = _get(ch, "char_end", "end", "stop", default=None)
        token_count = _get(ch, "token_count", "tokens", "n_tokens", default=None)
        section = _get(ch, "section", "heading", default=None)
        section_path = _get(ch, "section_path", "hpath", default=None)

        payload = _clean({
            "note_id": note_id,
            "title": title,
            "type": ntype,
            "path": rel_path or abs_path,
            "chunk_profile": chunk_profile,
            "retriever_weight": retriever_weight,
            "chunk_id": chunk_id or (f"{note_id}#ch{idx}" if note_id is not None else None),
            "chunk_index": idx,
            "text": text,
            "char_start": char_start,
            "char_end": char_end,
            "token_count": token_count,
            "section": section,
            "section_path": section_path,
        })

        # If the chunk is a dict, preserve its remaining non-None entries.
        if isinstance(ch, dict):
            # Avoid overwriting the fields we already normalized
            extras = {k: v for k, v in ch.items() if k not in payload and v is not None}
            if extras:
                payload.update(extras)

        out.append(payload)

    return out

View File

@ -1,139 +1,133 @@
"""
note_payload.py — mindnet core payload builders
Version: 1.4.2 (supersedes 1.3.1 of 2025-11-08)
Description:
Builds the payload for a *note* document destined for the Qdrant "notes" collection.
- Defensive against both dict-like and attribute-like "ParsedNote" inputs.
- Accepts extra/legacy arguments via *args / **kwargs (e.g., vault_root, type_defaults).
- Ensures "retriever_weight" is always present in the payload (float), resolved as:
kwarg retriever_weight > frontmatter.retriever_weight > frontmatter.retriever.weight >
type_defaults[type].retriever_weight > 1.0
- Preserves common metadata fields expected downstream.
Purpose
-------
Build a robust, forward-compatible note payload for Qdrant upserts.
This module is intentionally defensive:
- Accepts both dict-like "parsed note" objects and dataclass/objects with attributes.
- Tolerates extra kwargs from different callers (e.g., `vault_root`, `prefix`, etc.).
- Ensures `retriever_weight` is resolved and present in the payload if available.
Contract
--------
make_note_payload(note, **kwargs) -> Dict[str, Any]
Expected minimal fields in returned payload:
- note_id (str)
- title (str)
- type (str)
- path (str or None) # relative to vault_root when provided
- tags (List[str])
- retriever_weight (float) # always present; falls back to 1.0 when no weight is configured
Public API:
make_note_payload(parsed_note, *_, retriever_weight=None, vault_root=None, type_defaults=None, **__)
"""
from __future__ import annotations
from pathlib import Path
from typing import Any, Dict, Iterable, List, Mapping, Optional, Union
from typing import Any, Dict, Optional
def _as_dict(obj: Any) -> Dict[str, Any]:
    """Return a dict view of a parsed note.

    A ``dict`` is returned unchanged (same object). For any other object the
    known note attributes present on it are copied into a new dict; missing
    attributes are skipped, so the result can be empty.
    """
    if isinstance(obj, dict):
        return obj
    # Try common attribute names to build a dict view
    out: Dict[str, Any] = {}
    for key in (
        "frontmatter", "fm", "meta",
        "note_id", "id",
        "title", "type", "tags", "aliases",
        "created", "updated", "date",
        "abs_path", "path", "rel_path",
    ):
        if hasattr(obj, key):
            out[key] = getattr(obj, key)
    return out
def _get(obj: Any, *keys: str, default: Any = None) -> Any:
    """Get first existing key/attribute from obj.

    Dicts are queried by key, everything else by attribute name. The first
    candidate that exists is returned even if its value is ``None`` or
    falsy; *default* is returned when none of *keys* exists.

    ``default`` must be passed by keyword; extra positional arguments are
    interpreted as further candidate keys.
    """
    if isinstance(obj, dict):
        for k in keys:
            if k in obj:
                return obj[k]
        return default
    # attribute access
    for k in keys:
        if hasattr(obj, k):
            return getattr(obj, k)
    return default
def _get_frontmatter(note: Any) -> Mapping[str, Any]:
    """Return the note's ``frontmatter`` when it is a mapping, else ``{}``.

    The fallback is supplied as ``default=`` so the call is valid whether
    ``_get`` accepts one key or several candidate keys.
    """
    fm = _get(note, "frontmatter", default={})
    if isinstance(fm, Mapping):
        return fm
    return {}  # be safe
def _resolve_retriever_weight(explicit: Any, fm: Mapping[str, Any]) -> Optional[float]:
    """
    Priority:
    1) explicit kwarg retriever_weight
    2) frontmatter['retriever_weight']
    3) frontmatter['retriever']['weight']

    The highest-priority source that is present decides the outcome. If its
    value cannot be converted to ``float`` the result is ``None``; lower
    priorities are not consulted. ``None`` is also returned when no source
    is present.
    """
    def to_float(v: Any) -> Optional[float]:
        if v is None:
            return None
        try:
            return float(v)
        except (TypeError, ValueError, OverflowError):
            # Unparseable or out-of-range value: report "no weight".
            return None

    if explicit is not None:
        return to_float(explicit)
    if "retriever_weight" in fm:
        return to_float(fm.get("retriever_weight"))
    retr = fm.get("retriever")
    if isinstance(retr, Mapping) and "weight" in retr:
        return to_float(retr.get("weight"))
    return None
def _to_rel_path(abs_path: Optional[Union[str, Path]], vault_root: Optional[Union[str, Path]]) -> Optional[str]:
    """Return *abs_path* as a string, made relative to *vault_root* if it lies below it.

    ``None`` is passed through. Without a *vault_root*, or when *abs_path*
    is outside it, the path is returned as given after normalisation by
    ``Path``. Input that ``Path`` rejects is returned via ``str()``. No
    filesystem access takes place.
    """
    if abs_path is None:
        return None
    try:
        p = Path(abs_path)
    except TypeError:
        return str(abs_path)
    if not vault_root:
        return str(p)
    try:
        return str(p.relative_to(Path(vault_root)))
    except (TypeError, ValueError):
        # ValueError: p is not below vault_root; TypeError: unusable root.
        return str(p)


def _as_list(val: Any) -> Optional[list]:
    """Normalise *val* to a list; ``None`` stays ``None``.

    Lists and tuples are copied. A ``str`` or ``bytes`` value becomes a
    single-element list instead of being split. Other iterables are
    materialised with ``list()`` (best effort); a non-iterable value is
    wrapped in a one-element list.
    """
    if val is None:
        return None
    if isinstance(val, (list, tuple)):
        return list(val)
    if isinstance(val, (str, bytes)):
        return [val]
    try:
        return list(val)  # best-effort
    except TypeError:
        return [val]
def _coerce_float(val: Any, default: float) -> float:
    """Return ``float(val)``; fall back to ``float(default)``.

    The fallback applies when *val* is ``None`` or cannot be converted to a
    float (wrong type, unparseable string, integer too large).
    """
    if val is None:
        return float(default)
    try:
        return float(val)
    except (TypeError, ValueError, OverflowError):
        return float(default)
def _clean(d: Dict[str, Any]) -> Dict[str, Any]:
    """Return a new dict holding the entries of *d* whose value is not ``None``.

    Falsy values other than ``None`` (``0``, ``""``, empty containers) are kept.
    """
    return {k: v for k, v in d.items() if v is not None}
def make_note_payload(
    parsed_note: Any,
    *_,  # ignore legacy extra positional args for backward compatibility
    retriever_weight: Optional[float] = None,
    vault_root: Optional[str] = None,
    type_defaults: Optional[Dict[str, Dict[str, Any]]] = None,
    **__,  # ignore any unexpected kwargs
) -> Dict[str, Any]:
    """Build the payload dict for a single note.

    Args:
        parsed_note: The note, as a dict or as an object with attributes.
        retriever_weight: Explicit weight; overrides frontmatter and type
            defaults when given.
        vault_root: When set and the note has an absolute path but no
            ``rel_path``, the path is made relative to this directory.
        type_defaults: Per-note-type fallbacks; the keys ``chunk_profile``
            and ``retriever_weight`` are read.

    Returns:
        A dict with ``note_id``/``id``, ``title``, ``type``, ``tags``,
        ``aliases``, ``created``, ``updated``, ``path``, ``chunk_profile``
        and ``retriever_weight``. Entries whose value is ``None`` are
        omitted. ``retriever_weight`` is always a float and is 1.0 when
        nothing else is configured.

    Weight resolution order: the ``retriever_weight`` argument, then
    frontmatter ``retriever_weight``, then frontmatter ``retriever.weight``,
    then ``type_defaults[type]["retriever_weight"]``, then 1.0. A source
    counts as unset only when it is ``None`` or a blank string, so an
    explicit weight of 0 is honoured.
    """
    from pathlib import Path

    def is_unset(value: Any) -> bool:
        return value is None or (isinstance(value, str) and not value.strip())

    nd = _as_dict(parsed_note)
    fm = _get(nd, "frontmatter", "fm", "meta", default={}) or {}
    if not hasattr(fm, "get"):
        # Frontmatter that is not mapping-like (e.g. a raw string) is ignored.
        fm = {}

    note_id = _get(nd, "note_id", "id") or fm.get("id")
    title = _get(nd, "title") or fm.get("title")
    ntype = _get(nd, "type") or fm.get("type") or "concept"

    # Path handling
    abs_path = _get(nd, "abs_path", "path")
    rel_path = _get(nd, "rel_path")
    if vault_root and abs_path and not rel_path:
        try:
            rel_path = str(Path(abs_path).resolve().relative_to(Path(vault_root).resolve()))
        except (TypeError, ValueError, OSError, RuntimeError):
            # Not below the vault (or unresolvable): keep the path as given.
            rel_path = _get(nd, "path") or abs_path

    # Tags / aliases
    tags = _as_list(_get(nd, "tags") or fm.get("tags"))
    aliases = _as_list(_get(nd, "aliases") or fm.get("aliases"))

    # Created/Updated
    created = _get(nd, "created", "date") or fm.get("created") or fm.get("date")
    updated = _get(nd, "updated") or fm.get("updated")

    note_defaults = (type_defaults or {}).get(ntype) or {}

    # Chunk profile (effective)
    chunk_profile = fm.get("chunk_profile") or note_defaults.get("chunk_profile")

    # Retriever weight resolution (ensures it is present)
    if is_unset(retriever_weight):
        retriever_weight = fm.get("retriever_weight")
    if is_unset(retriever_weight):
        nested = fm.get("retriever")
        retriever_weight = nested.get("weight") if hasattr(nested, "get") else None
    if is_unset(retriever_weight):
        retriever_weight = note_defaults.get("retriever_weight")
    retriever_weight = _coerce_float(retriever_weight, default=1.0)

    payload = _clean({
        "note_id": note_id,
        "id": note_id,  # keep both, many downstream tools expect 'id'
        "title": title,
        "type": ntype,
        "tags": tags,
        "aliases": aliases,
        "created": created,
        "updated": updated,
        "path": rel_path or abs_path,
        "chunk_profile": chunk_profile,
        "retriever_weight": retriever_weight,
    })
    return payload