Dateien nach "app/core" hochladen
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 4s

This commit is contained in:
Lars 2025-11-16 18:51:06 +01:00
parent c7644a36aa
commit f18a40d76c

View File

@@ -1,182 +1,161 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
""" """
app/core/chunk_payload.py Mindnet V2 (compat) app/core/chunk_payload.py (Mindnet V2 robust)
Ziele (unveränderte v1-Basis, weniger Duplikate): Aufgabe
- **Kanonicum:** `index` -------
- **StandardAlias (v2):** `ord` (abschaltbar über ENV MINDNET_CHUNK_INCLUDE_ORD=0) Erzeugt Chunk-Payloads aus den vom Chunker gelieferten "Chunk"-Objekten.
- **Optionale Aliase:** gesteuert über ENV MINDNET_CHUNK_INDEX_ALIASES - Spiegelt `retriever_weight` und `chunk_profile` in **jedem** Chunk-Payload.
(z.B. "chunk_num,Chunk_Nummer" oder "Chunk_Number"). Standard: kein zusätzlicher Alias. - Werteauflösung: Frontmatter > types.yaml > Defaults.
- Verarbeitet Chunks als Dict **oder** Objekt (Dataclass) und setzt immer `id` (= `chunk_id`) - Lädt `config/types.yaml` selbst, wenn `types_cfg` nicht übergeben wurde.
- Berechnet `neighbors.prev/next`, falls nicht vorhanden
- Denormalisiert Note`tags` auf Chunks
- Akzeptiert `file_path=` als Alias zu `path_arg`
ENV: Eingang
- MINDNET_CHUNK_INCLUDE_ORD: "1" (Default) | "0" -------
- MINDNET_CHUNK_INDEX_ALIASES: CSVListe zulässiger Namen: chunk_num,Chunk_Nummer,Chunk_Number - note: Dict mit mind. { frontmatter: {...}, id, type, title, path }
- note_path: Pfad der Note (für Payload-Feld `path`)
- chunks_from_chunker: Liste von Objekten mit Attributen/Feldern:
id, note_id, index, text, window, neighbors_prev, neighbors_next
- note_text: voller Text der Note (optional, kann leer sein)
- types_cfg: optional; wenn None config wird intern geladen
- file_path: optional, für Debug/Tracing im Payload
Ausgang (pro Chunk)
-------------------
- Pflichtfelder: note_id, chunk_id, index (0-basiert), ord (1-basiert), type, tags
- Texte: text, window
- Nachbarn: neighbors_prev, neighbors_next
- Spiegelungen: retriever_weight, chunk_profile
- Meta: source_path, path, section (leer), created/updated opt. aus Frontmatter
Hinweis: `edge_defaults` sind NoteRegeln (nicht pro Chunk).
""" """
from __future__ import annotations from __future__ import annotations
import json
import os
import pathlib
import hashlib
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
import os, yaml
from app.core.chunker import assemble_chunks def _env(n: str, d: Optional[str]=None) -> str:
v = os.getenv(n)
return v if v is not None else (d or "")
# ---------- Helpers ---------- def _deep_get(root: Any, path: str) -> Any:
cur = root
for key in path.split("."):
if not isinstance(cur, dict) or key not in cur:
return None
cur = cur[key]
return cur
def _as_dict(obj): def _as_float(x: Any):
if isinstance(obj, dict): try:
return obj return float(x)
d = {} except Exception:
for k in ("index","ord","chunk_index","text","window","id","chunk_id","neighbors","note_id","type","title"):
if hasattr(obj, k):
d[k] = getattr(obj, k)
return d
def _coalesce(*vals):
for v in vals:
if v is not None:
return v
return None return None
def _load_types_local() -> dict:
    """Load the type registry from MINDNET_TYPES_FILE (default ./config/types.yaml).

    Best-effort: returns {} when the file is missing, unreadable, empty,
    or not valid YAML.
    """
    registry_path = _env("MINDNET_TYPES_FILE", "./config/types.yaml")
    try:
        with open(registry_path, "r", encoding="utf-8") as handle:
            data = yaml.safe_load(handle)
    except Exception:
        return {}
    return data or {}
def _ensure_list(x) -> list: def _effective_chunk_profile(note_type: str, fm: Dict[str, Any], reg: dict) -> Optional[str]:
if x is None: return [] # Frontmatter zuerst
if isinstance(x, list): return [str(i) for i in x] if isinstance(fm.get("chunk_profile"), str):
if isinstance(x, (set, tuple)): return [str(i) for i in x] return fm.get("chunk_profile")
return [str(x)] # Registry
types = reg.get("types") if isinstance(reg.get("types"), dict) else reg
if isinstance(types, dict):
v = types.get(note_type, {})
if isinstance(v, dict):
cp = v.get("chunk_profile")
if isinstance(cp, str):
return cp
return None
def _effective_retriever_weight(note_type: str, fm: Dict[str, Any], reg: dict) -> float:
    """Resolve the retriever weight: frontmatter > registry paths > 1.0 default."""
    # Frontmatter takes precedence.
    if fm.get("retriever_weight") is not None:
        fm_weight = _as_float(fm.get("retriever_weight"))
        if fm_weight is not None:
            return float(fm_weight)
    # The registry may nest entries under "types" or be flattened at the root.
    type_table = reg.get("types") if isinstance(reg.get("types"), dict) else reg
    search_paths = (
        f"{note_type}.retriever_weight",
        f"{note_type}.retriever.weight",
        f"{note_type}.retrieval.weight",
        "defaults.retriever_weight",
        "defaults.retriever.weight",
        "global.retriever_weight",
        "global.retriever.weight",
    )
    for dotted in search_paths:
        # When type_table == reg root (flattened), allow both "X" and "types.X".
        if "." in dotted:
            raw = _deep_get(type_table, dotted)
        else:
            raw = type_table.get(dotted) if isinstance(type_table, dict) else None
        if raw is None and isinstance(reg, dict):
            # Also try the absolute location below the "types" root.
            raw = _deep_get(reg, f"types.{dotted}")
        weight = _as_float(raw)
        if weight is not None:
            return float(weight)
    return 1.0
def _chunk_field(ch: Any, name: str, default: Any = None) -> Any:
    """Read *name* from a chunk given as a dict or attribute-bearing object.

    None-aware: only a missing or None value falls back to *default*, so
    falsy-but-valid values (index 0, empty strings) are preserved.
    """
    if isinstance(ch, dict):
        value = ch.get(name)
    else:
        value = getattr(ch, name, None)
    return default if value is None else value


def make_chunk_payloads(note: Dict[str, Any],
                        note_path: str,
                        chunks_from_chunker: List[Any],
                        *,
                        note_text: str = "",
                        types_cfg: Optional[dict] = None,
                        file_path: Optional[str] = None) -> List[Dict[str, Any]]:
    """Build one payload dict per chunk, mirroring note-level settings into every chunk.

    Value resolution is frontmatter > types.yaml > defaults; the registry is
    loaded from disk when *types_cfg* is not supplied.

    Parameters
    ----------
    note: note dict with at least a "frontmatter" mapping (id/type/tags/...).
    note_path: path of the note, stored as "path" in every payload.
    chunks_from_chunker: chunk objects or dicts with id/note_id/index/text/
        window/neighbors_prev/neighbors_next (and optionally section).
    note_text: full note text (currently unused; kept for interface stability).
    types_cfg: pre-loaded types registry; when None, config/types.yaml is read.
    file_path: source file for debug/tracing; "source_path" falls back to
        *note_path* when not given.

    Returns
    -------
    List of payloads with note_id, chunk_id, index (0-based), ord (1-based),
    type, tags, text, window, neighbors_prev/next, section, path,
    source_path, retriever_weight and (when resolved) chunk_profile.
    """
    note = note or {}  # guard: tolerate a None note for both lookups below
    fm = note.get("frontmatter", {}) or {}
    note_type = fm.get("type") or note.get("type") or "concept"
    reg = types_cfg if isinstance(types_cfg, dict) else _load_types_local()

    # Effective note-level values, mirrored into every chunk payload.
    cp = _effective_chunk_profile(note_type, fm, reg)
    rw = _effective_retriever_weight(note_type, fm, reg)

    tags = fm.get("tags") or []
    if isinstance(tags, str):
        tags = [tags]

    out: List[Dict[str, Any]] = []
    for idx, ch in enumerate(chunks_from_chunker):
        # _chunk_field is None-aware: a chunk object with index == 0 must not
        # fall through to the positional default (a plain `or` chain would).
        cid = _chunk_field(ch, "id")
        nid = _chunk_field(ch, "note_id", fm.get("id"))
        index = _chunk_field(ch, "index", idx)
        text = _chunk_field(ch, "text", "")
        window = _chunk_field(ch, "window", text)
        prev_id = _chunk_field(ch, "neighbors_prev")
        next_id = _chunk_field(ch, "neighbors_next")

        try:
            index = int(index)
        except (TypeError, ValueError):
            index = idx  # robust fallback: positional index instead of crashing

        pl: Dict[str, Any] = {
            "note_id": nid,
            "chunk_id": cid,
            "index": index,        # canonical, 0-based
            "ord": index + 1,      # v2 standard, 1-based
            "type": note_type,
            "tags": list(tags),    # copy: payloads must not alias one shared list
            "text": text,
            "window": window,
            "neighbors_prev": prev_id,
            "neighbors_next": next_id,
            "section": _chunk_field(ch, "section", ""),
            "path": note_path,
            "source_path": file_path or note_path,
            "retriever_weight": float(rw),
        }
        if cp is not None:
            pl["chunk_profile"] = cp
        out.append(pl)
    return out