Dateien nach "app/core" hochladen
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s

This commit is contained in:
Lars 2025-11-09 09:15:48 +01:00
parent 6dc37ccb66
commit bbd5a7fa48
2 changed files with 376 additions and 289 deletions

View File

@ -1,144 +1,180 @@
""" """
chunk_payload.py Mindnet payload helpers chunk_payload.py Mindnet payload builder (Chunks)
Version: 0.5.2 (generated 2025-11-08 21:03:48) Version: 1.3.0 (2025-11-09)
Purpose:
- Build CHUNK payloads list while preserving existing chunk fields (text, seq, etc.). Purpose
- Inject into *every* chunk: -------
* retriever_weight (resolved like note payload) Build Qdrant-compatible JSON payloads for *chunks* of a parsed note.
* chunk_profile (resolved like note payload) Tolerant to different call signatures and accepts both dict-like and object-like inputs.
Resolution order identical to note_payload.make_note_payload.
Signature tolerant to match existing importers. Key features
------------
- Reads type defaults from `config/config.yaml` or `config/types.yaml` (same schema).
- Resolves fields with precedence:
Frontmatter > type-defaults > ENV > fallback.
- Sets per chunk:
* `note_id`, `note_title`, `type`
* `retriever_weight` (float)
* `chunk_profile` (short|medium|long)
* `text` (never empty: falls back to whole note body/text)
* `order`, `section`, `start`, `end` (if available)
- Backwards-compatible signature: accepts **kwargs to swallow unknown args.
Input
-----
`parsed_note` may be:
- dict with keys: id, title, body/text, chunks(list), frontmatter(dict), type
- object with equivalent attributes
Each chunk may be dict-like or object-like with keys/attrs such as:
id, text, order, section, start, end
""" """
from __future__ import annotations from __future__ import annotations
from typing import Any, Dict, List, Optional, Union
from pathlib import Path
import os import os
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
try: try:
import yaml # type: ignore import yaml # type: ignore
except Exception: # pragma: no cover except Exception: # pragma: no cover
yaml = None # will skip YAML loading if unavailable yaml = None
def _get(obj: Any, key: str, default: Any = None) -> Any:
def _coerce_mapping(obj: Any) -> Dict[str, Any]:
if obj is None:
return {{}}
if isinstance(obj, dict): if isinstance(obj, dict):
return dict(obj) return obj.get(key, default)
out: Dict[str, Any] = {{}} return getattr(obj, key, default)
if hasattr(obj, "__dict__"):
out.update(getattr(obj, "__dict__"))
for k in ("id","note_id","title","type","path","source_path","frontmatter"):
if hasattr(obj, k) and k not in out:
out[k] = getattr(obj, k)
return out
def _frontmatter(obj: Any) -> Dict[str, Any]:
    """Return the ``frontmatter`` dict of a parsed note, or an empty dict.

    The value is looked up through ``_get`` under the key/attribute name
    ``"frontmatter"``. A missing, falsy, or non-dict value yields a new
    empty dict; a non-empty dict is returned as-is (not copied).
    """
    candidate = _get(obj, "frontmatter", {})
    if candidate and isinstance(candidate, dict):
        return candidate
    return {}
def _coerce_chunk_dict(obj: Any) -> Dict[str, Any]: def _coerce_float(val: Any, default: float) -> float:
if isinstance(obj, dict):
return dict(obj)
d = {{}}
# common attributes for a chunk object
for k in ("chunk_id","id","note_id","seq","start","end","text","title","type","source_path"):
if hasattr(obj, k):
d[k] = getattr(obj, k)
if hasattr(obj, "__dict__"):
for k,v in obj.__dict__.items():
d.setdefault(k, v)
return d
def _get_frontmatter(parsed: Dict[str, Any]) -> Dict[str, Any]:
fm = parsed.get("frontmatter")
return dict(fm) if isinstance(fm, dict) else {{}}
def _load_types_from_yaml(types_file: Optional[Union[str, Path]]) -> Dict[str, Any]:
if types_file is None:
for cand in (Path("config/types.yaml"), Path("config/types.yml"), Path("config.yaml"), Path("config.yml")):
if cand.exists():
types_file = cand
break
if types_file is None or yaml is None:
return {{}}
p = Path(types_file)
if not p.exists():
return {{}}
try: try:
data = yaml.safe_load(p.read_text(encoding="utf-8")) if val is None:
if not isinstance(data, dict): return default
return {{}} if isinstance(val, (int, float)):
if "types" in data and isinstance(data["types"], dict): return float(val)
return dict(data["types"]) if isinstance(val, str) and val.strip():
return data return float(val.strip())
except Exception: except Exception:
return {{}} pass
return default
def _normalize_chunk_profile(val: Any, fallback: str = "medium") -> str:
def _resolve_type_defaults(note_type: Optional[str], types: Optional[Dict[str,Any]]) -> Dict[str, Any]: if not isinstance(val, str):
if not note_type or not types or not isinstance(types, dict):
return {{}}
block = types.get(note_type)
return dict(block) if isinstance(block, dict) else {{}}
def _to_float(val: Any, fallback: float) -> float:
if val is None:
return fallback
try:
return float(val)
except Exception:
return fallback return fallback
v = val.strip().lower()
if v in {"short", "medium", "long"}:
return v
return fallback
def _safe_text(s: Any) -> str:
if s is None:
return ""
if isinstance(s, str):
return s
return str(s)
def _first_nonempty(*vals): def _load_types_config(
for v in vals: explicit_config: Optional[Dict[str, Any]] = None,
if v is not None: search_root: Union[str, Path, None] = None,
if isinstance(v, str) and v.strip() == "": ) -> Dict[str, Any]:
continue if explicit_config and isinstance(explicit_config, dict):
return v if "types" in explicit_config and isinstance(explicit_config["types"], dict):
return None return explicit_config
if yaml is None:
return {"types": {}}
candidates: List[Path] = []
root = Path(search_root) if search_root else Path.cwd()
candidates.append(root / "config" / "config.yaml")
candidates.append(root / "config" / "types.yaml")
candidates.append(Path.cwd() / "config" / "config.yaml")
candidates.append(Path.cwd() / "config" / "types.yaml")
for p in candidates:
try:
if p.exists():
import yaml as _y
with p.open("r", encoding="utf-8") as f:
loaded = _y.safe_load(f) or {}
if isinstance(loaded, dict) and isinstance(loaded.get("types"), dict):
return {"types": loaded["types"]}
except Exception:
continue
return {"types": {}}
def _type_defaults(note_type: str, cfg: Dict[str, Any]) -> Dict[str, Any]:
return (cfg.get("types") or {}).get(note_type, {}) if isinstance(cfg, dict) else {}
def make_chunk_payloads(parsed_note: Any, chunks: List[Any], **kwargs) -> List[Dict[str, Any]]: def make_chunk_payloads(
parsed = _coerce_mapping(parsed_note) parsed_note: Any,
fm = _get_frontmatter(parsed) config: Optional[Dict[str, Any]] = None,
**kwargs: Any,
) -> List[Dict[str, Any]]:
search_root = kwargs.get("search_root")
fm = _frontmatter(parsed_note)
note_type = fm.get("type") or _get(parsed_note, "type") or "concept"
note_type = str(note_type).strip().lower()
# external sources cfg = _load_types_config(config, search_root)
types_registry = kwargs.get("types") or kwargs.get("types_registry") defaults = _type_defaults(note_type, cfg)
types_from_yaml = _load_types_from_yaml(kwargs.get("types_file"))
types_all: Dict[str, Any] = types_registry if isinstance(types_registry, dict) else types_from_yaml
note_type: Optional[str] = _first_nonempty(parsed.get("type"), fm.get("type")) # Resolve retriever_weight: FM > type-defaults > ENV > 1.0
type_defaults = _resolve_type_defaults(note_type, types_all) rw = fm.get("retriever_weight")
if rw is None:
rw = defaults.get("retriever_weight")
if rw is None:
env_rw = os.getenv("MINDNET_DEFAULT_RETRIEVER_WEIGHT")
rw = _coerce_float(env_rw, 1.0)
else:
rw = _coerce_float(rw, 1.0)
env_default = os.getenv("MINDNET_DEFAULT_RETRIEVER_WEIGHT") # Resolve chunk_profile: FM > type-defaults > ENV > medium
env_default_val = _to_float(env_default, 1.0) if env_default is not None else 1.0 cp = fm.get("chunk_profile")
if cp is None:
cp = defaults.get("chunk_profile")
if cp is None:
cp = os.getenv("MINDNET_DEFAULT_CHUNK_PROFILE", "medium")
cp = _normalize_chunk_profile(cp, "medium")
effective_retriever_weight = _to_float( note_id = _get(parsed_note, "id")
_first_nonempty( note_title = _get(parsed_note, "title")
fm.get("retriever_weight"), body = _get(parsed_note, "body") or _get(parsed_note, "text") or ""
type_defaults.get("retriever_weight"),
env_default_val,
1.0,
),
1.0,
)
effective_chunk_profile = _first_nonempty( items = _get(parsed_note, "chunks") or []
fm.get("chunk_profile"), payloads: List[Dict[str, Any]] = []
fm.get("profile"),
type_defaults.get("chunk_profile"),
os.getenv("MINDNET_DEFAULT_CHUNK_PROFILE"),
)
out: List[Dict[str, Any]] = [] if not items:
for ch in chunks or []: items = [{
payload = _coerce_chunk_dict(ch) # preserve all existing chunk fields "id": f"{note_id}::0" if note_id else None,
payload["retriever_weight"] = effective_retriever_weight "text": body,
if effective_chunk_profile is not None: "order": 0,
payload["chunk_profile"] = effective_chunk_profile "section": None,
out.append(payload) "start": 0,
return out "end": len(body) if isinstance(body, str) else None,
}]
for ch in items:
text = _safe_text(_get(ch, "text"))
if not text:
text = _safe_text(body)
payload = {
"note_id": note_id,
"note_title": note_title,
"type": note_type,
"retriever_weight": float(rw),
"chunk_profile": cp,
"text": text,
"order": _get(ch, "order"),
"section": _get(ch, "section"),
"start": _get(ch, "start"),
"end": _get(ch, "end"),
"chunk_id": _get(ch, "id"),
}
payload = {k: v for k, v in payload.items() if v is not None}
payloads.append(payload)
return payloads

View File

@ -1,201 +1,252 @@
""" """
note_payload.py Mindnet payload helpers note_payload.py Mindnet payload builder (Notes)
Version: 0.5.2 (generated 2025-11-08 21:03:48) Version: 1.3.0 (2025-11-09)
Purpose:
- Build a NOTE payload without dropping existing fields. Purpose
- Resolve and inject: -------
* retriever_weight Build Qdrant-compatible JSON payloads for *notes* from a parsed Markdown
* chunk_profile representation. The function is tolerant to different call signatures and
* edge_defaults accepts both dict-like and object-like "ParsedNote" inputs.
Resolution order:
1) Frontmatter fields Key features
2) Type defaults from a provided registry ('types' kwarg) OR YAML file (types_file kwarg). ------------
YAML formats supported: - Reads type defaults from `config/config.yaml` or `config/types.yaml` (same schema).
- root['types'][note_type]{{retriever_weight, chunk_profile, edge_defaults}} - Resolves fields with the following precedence:
- root[note_type] is the type block directly Frontmatter > type-defaults > ENV > hard-coded fallback.
3) ENV MINDNET_DEFAULT_RETRIEVER_WEIGHT - Ensures only JSON-serializable types are included (no sets, Path, callables).
4) Fallback 1.0 - Sets/normalizes:
Notes: * `type` : note type (e.g., concept, task, experience, project)
- Function signature tolerant: accepts **kwargs (e.g. vault_root, types_file, types, types_registry). * `retriever_weight` : float, influences retrieval blending downstream
- Does NOT attempt to create edges; it only exposes 'edge_defaults' in the NOTE payload for later stages. * `chunk_profile` : short | medium | long (string)
* `edge_defaults` : list[str], used by edge builder outside of this module
- Backwards-compatible signature: accepts **kwargs to swallow unknown args
(e.g., vault_root, prefix, ...).
Expected input (flexible)
-------------------------
`parsed_note` may be:
- dict with keys: id, title, body/text, path, frontmatter (dict), type, ...
- object with attributes: id, title, body/text, path, frontmatter, type, ...
Schema for config files
-----------------------
version: 1.0
types:
concept:
chunk_profile: medium
edge_defaults: ["references", "related_to"]
retriever_weight: 0.33
task:
chunk_profile: short
edge_defaults: ["depends_on", "belongs_to"]
retriever_weight: 0.8
experience:
chunk_profile: medium
edge_defaults: ["derived_from", "inspired_by"]
retriever_weight: 0.9
project:
chunk_profile: long
edge_defaults: ["references", "depends_on"]
retriever_weight: 0.95
""" """
from __future__ import annotations from __future__ import annotations
from typing import Any, Dict, Optional, Mapping, Union
import json
import os import os
from pathlib import Path from pathlib import Path
from typing import Any, Dict, List, Optional, Union
try: try:
import yaml # type: ignore import yaml # type: ignore
except Exception: # pragma: no cover except Exception: # pragma: no cover
yaml = None # will skip YAML loading if unavailable yaml = None # The caller must ensure PyYAML is installed
# ------------------------------
# Helpers
# ------------------------------
# -------- helpers -------- def _get(obj: Any, key: str, default: Any = None) -> Any:
"""Get key from dict-like or attribute from object-like."""
def _coerce_mapping(obj: Any) -> Dict[str, Any]:
if obj is None:
return {{}}
if isinstance(obj, dict): if isinstance(obj, dict):
return dict(obj) return obj.get(key, default)
# try common attributes return getattr(obj, key, default)
out: Dict[str, Any] = {{}}
for k in ("__dict__",):
if hasattr(obj, k):
out.update(getattr(obj, k))
# named attributes we often see
for k in ("id","note_id","title","type","path","source_path","frontmatter"):
if hasattr(obj, k) and k not in out:
out[k] = getattr(obj, k)
return out
def _frontmatter(obj: Any) -> Dict[str, Any]:
    """Return the ``frontmatter`` dict of a parsed note, or an empty dict.

    The value is looked up through ``_get`` under the key/attribute name
    ``"frontmatter"``. A missing, falsy, or non-dict value yields a new
    empty dict; a non-empty dict is returned as-is (not copied).
    """
    candidate = _get(obj, "frontmatter", {})
    if candidate and isinstance(candidate, dict):
        return candidate
    return {}
def _get_frontmatter(parsed: Mapping[str, Any]) -> Dict[str, Any]: def _coerce_float(val: Any, default: float) -> float:
fm = parsed.get("frontmatter")
if isinstance(fm, dict):
return dict(fm)
return {{}} # tolerate notes without frontmatter
def _load_types_from_yaml(types_file: Optional[Union[str, Path]]) -> Dict[str, Any]:
if types_file is None:
# try common defaults
candidates = [
Path("config/types.yaml"),
Path("config/types.yml"),
Path("config.yaml"),
Path("config.yml"),
]
for p in candidates:
if p.exists():
types_file = p
break
if types_file is None:
return {{}}
p = Path(types_file)
if not p.exists() or yaml is None:
return {{}}
try: try:
data = yaml.safe_load(p.read_text(encoding="utf-8")) if val is None:
if not isinstance(data, dict): return default
return {{}} if isinstance(val, (int, float)):
# support both shapes: {{types: {{concept: ...}}}} OR {{concept: ...}} return float(val)
if "types" in data and isinstance(data["types"], dict): if isinstance(val, str) and val.strip():
return dict(data["types"]) return float(val.strip())
return data
except Exception: except Exception:
return {{}} pass
return default
def _normalize_chunk_profile(val: Any, fallback: str = "medium") -> str:
if not isinstance(val, str):
return fallback
v = val.strip().lower()
if v in {"short", "medium", "long"}:
return v
return fallback
def _resolve_type_defaults(note_type: Optional[str], types: Optional[Dict[str,Any]]) -> Dict[str, Any]: def _coerce_str_list(val: Any) -> List[str]:
defaults = {{}}
if not note_type or not types or not isinstance(types, dict):
return defaults
block = types.get(note_type)
if isinstance(block, dict):
defaults.update(block)
return defaults
def _to_float(val: Any, fallback: float) -> float:
if val is None: if val is None:
return fallback return []
if isinstance(val, list):
out: List[str] = []
for x in val:
if isinstance(x, str):
out.append(x)
else:
out.append(str(x))
return out
if isinstance(val, str):
# allow comma-separated
return [x.strip() for x in val.split(",") if x.strip()]
return []
def _safe_jsonable(value: Any) -> Any:
"""Ensure value is JSON-serializable (no sets, Path, callables, etc.)."""
if isinstance(value, (str, int, float, bool)) or value is None:
return value
if isinstance(value, list):
return [_safe_jsonable(v) for v in value]
if isinstance(value, dict):
return {str(k): _safe_jsonable(v) for k, v in value.items()}
if isinstance(value, Path):
return str(value)
# Avoid sets and other iterables that are not JSON-serializable
try: try:
return float(val) json.dumps(value)
return value
except Exception: except Exception:
return fallback return str(value)
# ------------------------------
# Config loading
# ------------------------------
def _first_nonempty(*vals): def _load_types_config(
for v in vals: explicit_config: Optional[Dict[str, Any]] = None,
if v is not None: search_root: Union[str, Path, None] = None,
if isinstance(v, str) and v.strip() == "": ) -> Dict[str, Any]:
"""
Load types config from:
1) explicit_config (if provided)
2) {search_root}/config/config.yaml
3) {search_root}/config/types.yaml
4) ./config/config.yaml
5) ./config/types.yaml
Returns a dict with shape: {"types": {...}} (empty if none found).
"""
if explicit_config and isinstance(explicit_config, dict):
if "types" in explicit_config and isinstance(explicit_config["types"], dict):
return explicit_config
candidates: List[Path] = []
root = Path(search_root) if search_root else Path.cwd()
candidates.append(root / "config" / "config.yaml")
candidates.append(root / "config" / "types.yaml")
# fallback to CWD when search_root was different
candidates.append(Path.cwd() / "config" / "config.yaml")
candidates.append(Path.cwd() / "config" / "types.yaml")
data = {}
if yaml is None:
return {"types": {}}
for p in candidates:
try:
if p.exists():
with p.open("r", encoding="utf-8") as f:
loaded = yaml.safe_load(f) or {}
if isinstance(loaded, dict) and isinstance(loaded.get("types"), dict):
data = {"types": loaded["types"]}
break
except Exception:
continue
if not data:
data = {"types": {}}
return data
def _type_defaults(note_type: str, cfg: Dict[str, Any]) -> Dict[str, Any]:
return (cfg.get("types") or {}).get(note_type, {}) if isinstance(cfg, dict) else {}
# ------------------------------
# Public API
# ------------------------------
def make_note_payload(
parsed_note: Any,
*,
config: Optional[Dict[str, Any]] = None,
search_root: Union[str, Path, None] = None,
**kwargs: Any,
) -> Dict[str, Any]:
"""
Build the payload for a NOTE. Tolerates extra kwargs (e.g., vault_root, prefix).
"""
fm = _frontmatter(parsed_note)
note_type = fm.get("type") or _get(parsed_note, "type") or "concept"
note_type = str(note_type).strip().lower()
# Load config and resolve defaults
cfg = _load_types_config(config, search_root)
defaults = _type_defaults(note_type, cfg)
# retriever_weight: FM > type-defaults > ENV > 1.0
rw = fm.get("retriever_weight")
if rw is None:
rw = defaults.get("retriever_weight")
if rw is None:
env_rw = os.getenv("MINDNET_DEFAULT_RETRIEVER_WEIGHT")
rw = _coerce_float(env_rw, 1.0)
else:
rw = _coerce_float(rw, 1.0)
# chunk_profile: FM > type-defaults > ENV > medium
cp = fm.get("chunk_profile")
if cp is None:
cp = defaults.get("chunk_profile")
if cp is None:
cp = os.getenv("MINDNET_DEFAULT_CHUNK_PROFILE", "medium")
cp = _normalize_chunk_profile(cp, "medium")
# edge_defaults: FM > type-defaults > empty
edge_defs = fm.get("edge_defaults")
if edge_defs is None:
edge_defs = defaults.get("edge_defaults", [])
edge_defs = _coerce_str_list(edge_defs)
payload: Dict[str, Any] = {
"id": _get(parsed_note, "id"),
"note_id": _get(parsed_note, "id"),
"title": _get(parsed_note, "title"),
"type": note_type,
"retriever_weight": float(rw),
"chunk_profile": cp,
"edge_defaults": edge_defs,
# Useful passthrough/meta (all made JSON-safe)
"path": _safe_jsonable(_get(parsed_note, "path")),
"source": _safe_jsonable(_get(parsed_note, "source")),
}
# Include raw frontmatter keys (stringify keys; make safe)
if isinstance(fm, dict):
for k, v in fm.items():
# avoid overwriting normalized fields
if k in {"type", "retriever_weight", "chunk_profile", "edge_defaults"}:
continue continue
return v payload[f"fm_{k}"] = _safe_jsonable(v)
return None
# -------- main API --------
def make_note_payload(parsed_note: Any, **kwargs) -> Dict[str, Any]:
parsed = _coerce_mapping(parsed_note)
fm = _get_frontmatter(parsed)
# external sources
types_registry = kwargs.get("types") or kwargs.get("types_registry")
types_from_yaml = _load_types_from_yaml(kwargs.get("types_file"))
# registry wins over YAML if provided
types_all: Dict[str, Any] = types_registry if isinstance(types_registry, dict) else types_from_yaml
note_type: Optional[str] = _first_nonempty(parsed.get("type"), fm.get("type"))
title: Optional[str] = _first_nonempty(parsed.get("title"), fm.get("title"))
note_id: Optional[str] = _first_nonempty(parsed.get("note_id"), parsed.get("id"), fm.get("id"))
type_defaults = _resolve_type_defaults(note_type, types_all)
# --- resolve retriever_weight ---
env_default = os.getenv("MINDNET_DEFAULT_RETRIEVER_WEIGHT")
env_default_val = _to_float(env_default, 1.0) if env_default is not None else 1.0
effective_retriever_weight = _to_float(
_first_nonempty(
fm.get("retriever_weight"),
type_defaults.get("retriever_weight"),
env_default_val,
1.0,
),
1.0,
)
# --- resolve chunk_profile ---
effective_chunk_profile = _first_nonempty(
fm.get("chunk_profile"),
fm.get("profile"),
type_defaults.get("chunk_profile"),
os.getenv("MINDNET_DEFAULT_CHUNK_PROFILE"),
)
# --- resolve edge_defaults (list[str]) ---
edge_defaults = _first_nonempty(
fm.get("edge_defaults"),
type_defaults.get("edge_defaults"),
)
if edge_defaults is None:
edge_defaults = []
if isinstance(edge_defaults, str):
# allow "a,b,c"
edge_defaults = [s.strip() for s in edge_defaults.split(",") if s.strip()]
elif not isinstance(edge_defaults, list):
edge_defaults = []
# Start payload by preserving existing parsed keys (shallow copy); DO NOT drop fields
payload: Dict[str, Any] = dict(parsed)
# Ensure canonical top-level fields
if note_id is not None:
payload["id"] = note_id
payload["note_id"] = note_id
if title is not None:
payload["title"] = title
if note_type is not None:
payload["type"] = note_type
payload["retriever_weight"] = effective_retriever_weight
if effective_chunk_profile is not None:
payload["chunk_profile"] = effective_chunk_profile
if edge_defaults:
payload["edge_defaults"] = edge_defaults
# keep frontmatter merged (without duplication)
if "frontmatter" in payload and isinstance(payload["frontmatter"], dict):
fm_out = dict(payload["frontmatter"])
fm_out.setdefault("type", note_type)
fm_out["retriever_weight"] = effective_retriever_weight
if effective_chunk_profile is not None:
fm_out["chunk_profile"] = effective_chunk_profile
if edge_defaults:
fm_out["edge_defaults"] = edge_defaults
payload["frontmatter"] = fm_out
# Remove None values to keep payload clean
payload = {k: v for k, v in payload.items() if v is not None}
return payload return payload