Dateien nach "app/core" hochladen
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
This commit is contained in:
parent
a686bdbeaf
commit
290b271cf6
|
|
@ -1,162 +1,149 @@
|
|||
"""
|
||||
chunk_payload.py — mindnet core payload builders
|
||||
Version: 1.3.1 (2025-11-08)
|
||||
chunk_payload.py
|
||||
Version: 1.4.2
|
||||
Description:
|
||||
Builds the payloads for *chunks* of a note destined for the Qdrant "chunks" collection.
|
||||
- Defensive against both dict-like and attribute-like chunk objects.
|
||||
- Accepts extra/legacy arguments via *args / **kwargs (e.g., vault_root, type_defaults).
|
||||
- Ensures "retriever_weight" is present in every chunk payload, derived from the note.
|
||||
- Preserves common chunk metadata (idx, offsets, tokens, section info, etc.).
|
||||
- Tolerates legacy third positional parameter.
|
||||
|
||||
Purpose
|
||||
-------
|
||||
Build robust chunk payloads for Qdrant upserts.
|
||||
This function is intentionally flexible about its signature to remain
|
||||
compatible with different callers.
|
||||
|
||||
Contract
|
||||
--------
|
||||
make_chunk_payloads(note, chunks, *args, **kwargs) -> List[Dict[str, Any]]
|
||||
|
||||
Each returned item contains at least:
|
||||
- note_id (str)
|
||||
- title (str)
|
||||
- type (str)
|
||||
- path (str or None)
|
||||
- tags (List[str])
|
||||
- chunk_index (int)
|
||||
- text (str)
|
||||
- retriever_weight (float or None) # if available
|
||||
Public API:
|
||||
make_chunk_payloads(parsed_note, chunks, *_, retriever_weight=None, vault_root=None, type_defaults=None, **__)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Union
|
||||
from typing import Any, Dict, Iterable, List, Optional
|
||||
|
||||
def _as_dict(obj: Any) -> Dict[str, Any]:
    """Return a plain-dict view of a parsed note.

    Args:
        obj: A dict, another mapping, or an arbitrary object exposing note
            fields as attributes.

    Returns:
        * ``obj`` itself (same object, not a copy) when it already is a dict;
        * a new dict copy when ``obj`` is some other mapping;
        * otherwise a new dict holding every well-known note attribute that
          exists on ``obj`` (empty when none exists, e.g. for ``None``).
    """
    if isinstance(obj, dict):
        return obj

    # Imported locally so this helper does not depend on the module header.
    from collections.abc import Mapping

    if isinstance(obj, Mapping):
        # Mapping types other than dict would otherwise fall through to the
        # attribute probe below and lose all of their entries.
        return dict(obj)

    known_fields = (
        "frontmatter", "fm", "meta",
        "note_id", "id",
        "title", "type", "tags", "aliases",
        "created", "updated", "date",
        "abs_path", "path", "rel_path",
    )
    return {key: getattr(obj, key) for key in known_fields if hasattr(obj, key)}
|
||||
|
||||
def _get(obj: Any, *keys: str, default: Any = None) -> Any:
    """Return the value of the first of *keys* present on *obj*.

    Dicts are probed by key, every other object by attribute. A key that is
    present with a ``None`` value counts as found and ``None`` is returned.

    Args:
        obj: Dict or attribute-style object (``None`` is tolerated).
        *keys: Candidate key/attribute names, tried in order.
        default: Value returned when none of *keys* is present.

    Returns:
        The first matching value, else ``default``.
    """
    # Older call sites used ``_get(obj, key, default)``. A trailing
    # non-string positional argument cannot be a key/attribute name, so it is
    # interpreted as that legacy default instead of raising TypeError.
    # A legacy default that is itself a string cannot be told apart from an
    # alias key and is therefore treated as a key.
    if len(keys) > 1 and not isinstance(keys[-1], str):
        if default is None:
            default = keys[-1]
        keys = keys[:-1]

    if isinstance(obj, dict):
        for k in keys:
            if k in obj:
                return obj[k]
        return default

    for k in keys:
        if hasattr(obj, k):
            return getattr(obj, k)
    return default
|
||||
|
||||
|
||||
def _get_frontmatter(note: Any) -> Mapping[str, Any]:
    """Return the note's ``frontmatter`` if it is a mapping, else ``{}``.

    Args:
        note: Dict-like or attribute-style note object.

    Returns:
        The value stored under ``frontmatter`` when it is a ``Mapping``;
        an empty dict when it is missing or of any other type.
    """
    # The fallback is passed by keyword: it works with both the single-key
    # and the multi-key (``*keys``) form of ``_get``, whereas a positional
    # ``{}`` would be taken for a key name by the latter.
    fm = _get(note, "frontmatter", default={})
    return fm if isinstance(fm, Mapping) else {}
|
||||
|
||||
|
||||
def _resolve_retriever_weight(explicit: Any, fm: Mapping[str, Any]) -> Optional[float]:
    """Resolve the retriever weight for a note.

    Priority:
        1) ``explicit`` (when not ``None``)
        2) ``fm["retriever_weight"]``
        3) ``fm["retriever"]["weight"]``

    Only the first source that is *present* is consulted. If its value
    cannot be converted to ``float`` the result is ``None``; lower-priority
    sources are not tried.

    Args:
        explicit: Caller-supplied override, or ``None``.
        fm: Frontmatter mapping; anything that is not a mapping is treated
            as empty.

    Returns:
        The weight as ``float``, or ``None`` when unavailable/unparseable.
    """

    def to_float(value: Any) -> Optional[float]:
        if value is None:
            return None
        try:
            return float(value)
        except (TypeError, ValueError, OverflowError):
            return None

    if explicit is not None:
        return to_float(explicit)

    if not isinstance(fm, Mapping):
        # Guards the membership tests below against None / scalar input.
        return None

    if "retriever_weight" in fm:
        return to_float(fm.get("retriever_weight"))

    retr = fm.get("retriever")
    if isinstance(retr, Mapping) and "weight" in retr:
        return to_float(retr.get("weight"))

    return None
|
||||
|
||||
|
||||
def _to_rel_path(abs_path: Optional[Union[str, Path]], vault_root: Optional[Union[str, Path]]) -> Optional[str]:
    """Express *abs_path* relative to *vault_root* where possible.

    Args:
        abs_path: Path of the note, or ``None``.
        vault_root: Base directory; falsy means "do not relativize".

    Returns:
        ``None`` when ``abs_path`` is ``None``; the path relative to
        ``vault_root`` when it lies below it; otherwise the path unchanged,
        as a string.
    """
    if abs_path is None:
        return None
    try:
        p = Path(abs_path)
    except TypeError:
        # Not path-like: fall back to its plain string form.
        return str(abs_path)
    if not vault_root:
        return str(p)
    try:
        return str(p.relative_to(Path(vault_root)))
    except (TypeError, ValueError):
        # ValueError: p is not located below vault_root.
        return str(p)


def _as_list(val: Any) -> Optional[List[Any]]:
    """Normalize a scalar-or-iterable value to a list.

    Args:
        val: ``None``, a single value, or an iterable of values.

    Returns:
        ``None`` for ``None``; a new list for lists, tuples and other
        iterables; a one-element list for strings, bytes and non-iterables.
    """
    if val is None:
        return None
    if isinstance(val, (list, tuple)):
        return list(val)
    if isinstance(val, (str, bytes)):
        # Text is one value, not a sequence of characters/bytes.
        return [val]
    try:
        return list(val)
    except TypeError:
        return [val]
|
||||
|
||||
|
||||
def _coerce_chunks(chunks_obj: Any) -> List[Any]:
    """Accept lists of dicts/objects or generators; coerce to list safely.

    Args:
        chunks_obj: ``None``, a list, or any other iterable of chunks.

    Returns:
        ``[]`` for ``None``; the same list object for a list; a new list for
        other iterables; ``[]`` when the value cannot be iterated.
    """
    if chunks_obj is None:
        return []
    if isinstance(chunks_obj, list):
        return chunks_obj
    try:
        return list(chunks_obj)
    except Exception:
        # Deliberately best-effort: a failing iterable yields no chunks.
        return []


def _coerce_float(val: Any, default: float) -> float:
    """Convert *val* to ``float``, falling back to *default*.

    Args:
        val: Value to convert; may be ``None``.
        default: Fallback used for ``None`` or unconvertible values.

    Returns:
        ``float(val)`` when possible, else ``float(default)``.
    """
    if val is None:
        return float(default)
    try:
        return float(val)
    except (TypeError, ValueError, OverflowError):
        return float(default)


def _get_chunk_text(c: Any) -> str:
    """Return the text of a chunk.

    The first non-blank string found under ``text``, ``chunk``, ``body`` or
    ``content`` is returned; failing that, ``str(c)`` (``""`` for ``None``).
    """
    for key in ("text", "chunk", "body", "content"):
        v = _get(c, key)
        if isinstance(v, str) and v.strip():
            return v
    # last resort: string repr
    return str(c) if c is not None else ""
|
||||
|
||||
def _clean(d: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of *d* without the entries whose value is ``None``.

    Other falsy values (``0``, ``""``, ``[]``) are kept.
    """
    return {k: v for k, v in d.items() if v is not None}
|
||||
|
||||
def make_chunk_payloads(
    parsed_note: Any,
    chunks: Iterable[Any],
    *_,  # legacy extra positional parameters tolerated
    retriever_weight: Optional[float] = None,
    vault_root: Optional[str] = None,
    type_defaults: Optional[Dict[str, Dict[str, Any]]] = None,
    **__,  # ignore unexpected kwargs
) -> List[Dict[str, Any]]:
    """Build one payload dict per chunk of a note.

    Args:
        parsed_note: Dict-like or attribute-style note; frontmatter is read
            from ``frontmatter``, ``fm`` or ``meta``.
        chunks: Iterable of dict-like or attribute-style chunks; ``None`` is
            treated as empty.
        retriever_weight: Explicit weight override. When ``None`` the weight
            is taken from ``frontmatter.retriever_weight``, then
            ``frontmatter.retriever.weight``, then
            ``type_defaults[type].retriever_weight``, then ``1.0``.
        vault_root: When given (and the note has no ``rel_path``), the note
            path is made relative to it.
        type_defaults: Per-note-type defaults (``chunk_profile``,
            ``retriever_weight``).

    Returns:
        A list with one dict per chunk. Keys whose value is ``None`` are
        omitted; ``retriever_weight`` is always present as a float. Extra
        keys of dict chunks are preserved unless they collide with a
        normalized field.
    """
    from pathlib import Path

    nd = _as_dict(parsed_note)
    fm = _get(nd, "frontmatter", "fm", "meta", default={}) or {}
    if not hasattr(fm, "get"):
        # Frontmatter that is not mapping-like cannot be queried.
        fm = {}

    note_id = _get(nd, "note_id", "id") or fm.get("id")
    title = _get(nd, "title") or fm.get("title")
    ntype = _get(nd, "type") or fm.get("type") or "concept"
    defaults_for_type = (type_defaults or {}).get(ntype) or {}

    # Effective path for source reference
    abs_path = _get(nd, "abs_path", "path")
    rel_path = _get(nd, "rel_path")
    if vault_root and abs_path and not rel_path:
        try:
            rel_path = str(Path(abs_path).resolve().relative_to(Path(vault_root).resolve()))
        except (OSError, RuntimeError, TypeError, ValueError):
            # e.g. the note is not located below vault_root
            rel_path = _get(nd, "path") or abs_path
    path = rel_path or abs_path
    if path is not None:
        # Path objects would otherwise end up in the payload as-is.
        path = str(path)

    # Effective chunk_profile
    chunk_profile = fm.get("chunk_profile") or defaults_for_type.get("chunk_profile")

    # Resolve retriever_weight once at note level, apply to all chunks.
    # Each step tests for None explicitly so that a weight of 0 is honoured
    # instead of being skipped as falsy.
    if retriever_weight is None:
        retriever_weight = fm.get("retriever_weight")
    if retriever_weight is None:
        retriever_cfg = fm.get("retriever")
        if isinstance(retriever_cfg, dict):
            retriever_weight = retriever_cfg.get("weight")
    if retriever_weight is None:
        retriever_weight = defaults_for_type.get("retriever_weight")
    retriever_weight = _coerce_float(retriever_weight, default=1.0)

    out: List[Dict[str, Any]] = []
    for i, ch in enumerate(chunks or ()):
        # Basic fields with many aliases
        chunk_id = _get(ch, "chunk_id", "id", default=None)
        idx = _get(ch, "idx", "index", default=i)
        if idx is None:
            # An explicit None index falls back to the enumeration position.
            idx = i

        payload = _clean({
            "note_id": note_id,
            "title": title,
            "type": ntype,
            "path": path,
            "chunk_profile": chunk_profile,
            "retriever_weight": retriever_weight,

            "chunk_id": chunk_id or (f"{note_id}#ch{idx}" if note_id is not None else None),
            "chunk_index": idx,
            "text": _get(ch, "text", "content", "body", "chunk_text", default=None),
            "char_start": _get(ch, "char_start", "start", "begin", default=None),
            "char_end": _get(ch, "char_end", "end", "stop", default=None),
            "token_count": _get(ch, "token_count", "tokens", "n_tokens", default=None),
            "section": _get(ch, "section", "heading", default=None),
            "section_path": _get(ch, "section_path", "hpath", default=None),
        })

        # If the chunk object carries an existing mapping of extra metadata, preserve it.
        if isinstance(ch, dict):
            # Avoid overwriting the fields we already normalized
            payload.update(
                {k: v for k, v in ch.items() if k not in payload and v is not None}
            )

        out.append(payload)

    return out
|
||||
|
|
|
|||
|
|
@ -1,139 +1,133 @@
|
|||
"""
|
||||
note_payload.py — mindnet core payload builders
|
||||
Version: 1.3.1 (2025-11-08)
|
||||
note_payload.py
|
||||
Version: 1.4.2
|
||||
Description:
|
||||
Builds the payload for a *note* document destined for the Qdrant "notes" collection.
|
||||
- Defensive against both dict-like and attribute-like "ParsedNote" inputs.
|
||||
- Accepts extra/legacy arguments via *args / **kwargs (e.g., vault_root, type_defaults).
|
||||
- Ensures "retriever_weight" is always present in the payload (float), resolved as:
|
||||
kwarg retriever_weight > frontmatter.retriever_weight > frontmatter.retriever.weight >
|
||||
type_defaults[type].retriever_weight > 1.0
|
||||
- Preserves common metadata fields expected downstream.
|
||||
|
||||
Purpose
|
||||
-------
|
||||
Build a robust, forward-compatible note payload for Qdrant upserts.
|
||||
This module is intentionally defensive:
|
||||
- Accepts both dict-like "parsed note" objects and dataclass/objects with attributes.
|
||||
- Tolerates extra kwargs from different callers (e.g., `vault_root`, `prefix`, etc.).
|
||||
- Ensures `retriever_weight` is resolved and present in the payload if available.
|
||||
|
||||
Contract
|
||||
--------
|
||||
make_note_payload(note, **kwargs) -> Dict[str, Any]
|
||||
|
||||
Expected minimal fields in returned payload:
|
||||
- note_id (str)
|
||||
- title (str)
|
||||
- type (str)
|
||||
- path (str or None) # relative to vault_root when provided
|
||||
- tags (List[str])
|
||||
- retriever_weight (float or None) # if available
|
||||
Public API:
|
||||
make_note_payload(parsed_note, *_, retriever_weight=None, vault_root=None, type_defaults=None, **__)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Iterable, List, Mapping, Optional, Union
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
def _as_dict(obj: Any) -> Dict[str, Any]:
    """Return a plain-dict view of a parsed note.

    Args:
        obj: A dict, another mapping, or an arbitrary object exposing note
            fields as attributes.

    Returns:
        * ``obj`` itself (same object, not a copy) when it already is a dict;
        * a new dict copy when ``obj`` is some other mapping;
        * otherwise a new dict holding every well-known note attribute that
          exists on ``obj`` (empty when none exists, e.g. for ``None``).
    """
    if isinstance(obj, dict):
        return obj

    # Imported locally so this helper does not depend on the module header.
    from collections.abc import Mapping

    if isinstance(obj, Mapping):
        # Mapping types other than dict would otherwise fall through to the
        # attribute probe below and lose all of their entries.
        return dict(obj)

    # Try common attribute names to build a dict view
    known_fields = (
        "frontmatter", "fm", "meta",
        "note_id", "id",
        "title", "type", "tags", "aliases",
        "created", "updated", "date",
        "abs_path", "path", "rel_path",
    )
    return {key: getattr(obj, key) for key in known_fields if hasattr(obj, key)}
|
||||
|
||||
def _get(obj: Any, *keys: str, default: Any = None) -> Any:
    """Get first existing key/attribute from obj.

    Dicts are probed by key, every other object by attribute. A key that is
    present with a ``None`` value counts as found and ``None`` is returned.

    Args:
        obj: Dict or attribute-style object (``None`` is tolerated).
        *keys: Candidate key/attribute names, tried in order.
        default: Value returned when none of *keys* is present.

    Returns:
        The first matching value, else ``default``.
    """
    # Older call sites used ``_get(obj, key, default)``. A trailing
    # non-string positional argument cannot be a key/attribute name, so it is
    # interpreted as that legacy default instead of raising TypeError.
    # A legacy default that is itself a string cannot be told apart from an
    # alias key and is therefore treated as a key.
    if len(keys) > 1 and not isinstance(keys[-1], str):
        if default is None:
            default = keys[-1]
        keys = keys[:-1]

    if isinstance(obj, dict):
        for k in keys:
            if k in obj:
                return obj[k]
        return default

    # attribute access
    for k in keys:
        if hasattr(obj, k):
            return getattr(obj, k)
    return default
|
||||
|
||||
|
||||
def _get_frontmatter(note: Any) -> Mapping[str, Any]:
    """Return the note's ``frontmatter`` if it is a mapping, else ``{}``.

    Args:
        note: Dict-like or attribute-style note object.

    Returns:
        The value stored under ``frontmatter`` when it is a ``Mapping``;
        an empty dict when it is missing or of any other type.
    """
    # The fallback is passed by keyword: it works with both the single-key
    # and the multi-key (``*keys``) form of ``_get``, whereas a positional
    # ``{}`` would be taken for a key name by the latter.
    fm = _get(note, "frontmatter", default={})
    if isinstance(fm, Mapping):
        return fm
    return {}  # be safe
|
||||
|
||||
|
||||
def _resolve_retriever_weight(explicit: Any, fm: Mapping[str, Any]) -> Optional[float]:
    """Resolve the retriever weight for a note.

    Priority:
        1) explicit kwarg retriever_weight
        2) frontmatter['retriever_weight']
        3) frontmatter['retriever']['weight']

    Only the first source that is *present* is consulted. If its value
    cannot be converted to ``float`` the result is ``None``; lower-priority
    sources are not tried.

    Args:
        explicit: Caller-supplied override, or ``None``.
        fm: Frontmatter mapping; anything that is not a mapping is treated
            as empty.

    Returns:
        The weight as ``float``, or ``None`` when unavailable/unparseable.
    """

    def to_float(value: Any) -> Optional[float]:
        if value is None:
            return None
        try:
            return float(value)
        except (TypeError, ValueError, OverflowError):
            return None

    if explicit is not None:
        return to_float(explicit)

    if not isinstance(fm, Mapping):
        # Guards the membership tests below against None / scalar input.
        return None

    if "retriever_weight" in fm:
        return to_float(fm.get("retriever_weight"))

    retr = fm.get("retriever")
    if isinstance(retr, Mapping) and "weight" in retr:
        return to_float(retr.get("weight"))

    return None
|
||||
|
||||
|
||||
def _to_rel_path(abs_path: Optional[Union[str, Path]], vault_root: Optional[Union[str, Path]]) -> Optional[str]:
    """Express *abs_path* relative to *vault_root* where possible.

    Args:
        abs_path: Path of the note, or ``None``.
        vault_root: Base directory; falsy means "do not relativize".

    Returns:
        ``None`` when ``abs_path`` is ``None``; the path relative to
        ``vault_root`` when it lies below it; otherwise the path unchanged,
        as a string.
    """
    if abs_path is None:
        return None
    try:
        p = Path(abs_path)
    except TypeError:
        # Not path-like: fall back to its plain string form.
        return str(abs_path)
    if not vault_root:
        return str(p)
    try:
        return str(p.relative_to(Path(vault_root)))
    except (TypeError, ValueError):
        # ValueError: p is not located below vault_root.
        return str(p)


def _as_list(val: Any) -> Optional[List[Any]]:
    """Normalize a scalar-or-iterable value to a list.

    Args:
        val: ``None``, a single value, or an iterable of values.

    Returns:
        ``None`` for ``None``; a new list for lists, tuples and other
        iterables (best-effort); a one-element list for strings, bytes and
        non-iterables.
    """
    if val is None:
        return None
    if isinstance(val, (list, tuple)):
        return list(val)
    if isinstance(val, (str, bytes)):
        # Text is one value, not a sequence of characters/bytes.
        return [val]
    try:
        return list(val)  # best-effort
    except TypeError:
        return [val]
|
||||
|
||||
def _coerce_float(val: Any, default: float) -> float:
    """Convert *val* to ``float``, falling back to *default*.

    Args:
        val: Value to convert; may be ``None``.
        default: Fallback used for ``None`` or unconvertible values.

    Returns:
        ``float(val)`` when possible, else ``float(default)``.
    """
    if val is None:
        return float(default)
    try:
        return float(val)
    except (TypeError, ValueError, OverflowError):
        return float(default)
|
||||
|
||||
def _clean(d: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of *d* without the entries whose value is ``None``.

    Other falsy values (``0``, ``""``, ``[]``) are kept.
    """
    return {k: v for k, v in d.items() if v is not None}
|
||||
|
||||
def make_note_payload(
    parsed_note: Any,
    *_,  # ignore legacy extra positional args for backward compatibility
    retriever_weight: Optional[float] = None,
    vault_root: Optional[str] = None,
    type_defaults: Optional[Dict[str, Dict[str, Any]]] = None,
    **__,  # ignore any unexpected kwargs
) -> Dict[str, Any]:
    """Build a normalized note payload.

    Unknown positional and keyword arguments are ignored.

    Args:
        parsed_note: Dict-like or attribute-style note; frontmatter is read
            from ``frontmatter``, ``fm`` or ``meta``.
        retriever_weight: Explicit weight override. When ``None`` the weight
            is taken from ``frontmatter.retriever_weight``, then
            ``frontmatter.retriever.weight``, then
            ``type_defaults[type].retriever_weight``, then ``1.0``.
        vault_root: When given (and the note has no ``rel_path``), the note
            path is made relative to it.
        type_defaults: Per-note-type defaults (``chunk_profile``,
            ``retriever_weight``).

    Returns:
        The payload dict. Keys whose value is ``None`` are omitted;
        ``retriever_weight`` is always present as a float.
    """
    from pathlib import Path

    nd = _as_dict(parsed_note)
    fm = _get(nd, "frontmatter", "fm", "meta", default={}) or {}
    if not hasattr(fm, "get"):
        # Frontmatter that is not mapping-like cannot be queried.
        fm = {}

    note_id = _get(nd, "note_id", "id") or fm.get("id")
    title = _get(nd, "title") or fm.get("title")
    ntype = _get(nd, "type") or fm.get("type") or "concept"
    defaults_for_type = (type_defaults or {}).get(ntype) or {}

    # Path handling
    abs_path = _get(nd, "abs_path", "path")
    rel_path = _get(nd, "rel_path")
    if vault_root and abs_path and not rel_path:
        try:
            rel_path = str(Path(abs_path).resolve().relative_to(Path(vault_root).resolve()))
        except (OSError, RuntimeError, TypeError, ValueError):
            # e.g. the note is not located below vault_root
            rel_path = _get(nd, "path") or abs_path
    path = rel_path or abs_path
    if path is not None:
        # Path objects would otherwise end up in the payload as-is.
        path = str(path)

    # Tags / aliases
    tags = _as_list(_get(nd, "tags") or fm.get("tags"))
    aliases = _as_list(_get(nd, "aliases") or fm.get("aliases"))

    # Created/Updated
    created = _get(nd, "created", "date") or fm.get("created") or fm.get("date")
    updated = _get(nd, "updated") or fm.get("updated")

    # Chunk profile (effective)
    chunk_profile = fm.get("chunk_profile") or defaults_for_type.get("chunk_profile")

    # Retriever weight resolution (ensures it is present).
    # Each step tests for None explicitly so that a weight of 0 is honoured
    # instead of being skipped as falsy.
    if retriever_weight is None:
        retriever_weight = fm.get("retriever_weight")
    if retriever_weight is None:
        retriever_cfg = fm.get("retriever")
        if isinstance(retriever_cfg, dict):
            retriever_weight = retriever_cfg.get("weight")
    if retriever_weight is None:
        retriever_weight = defaults_for_type.get("retriever_weight")
    retriever_weight = _coerce_float(retriever_weight, default=1.0)

    return _clean({
        "note_id": note_id,
        "id": note_id,  # keep both, many downstream tools expect 'id'
        "title": title,
        "type": ntype,
        "tags": tags,
        "aliases": aliases,
        "created": created,
        "updated": updated,
        "path": path,
        "chunk_profile": chunk_profile,
        "retriever_weight": retriever_weight,
    })
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user