mindnet/app/core/chunk_payload.py
Lars 9e8b433c95
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 2s
Dateien nach "app/core" hochladen
2025-11-11 17:04:53 +01:00

183 lines
6.8 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
app/core/chunk_payload.py — Mindnet V2 (compat)
Ziele (ohne Bruch zur lauffähigen v1-Basis):
- Akzeptiert `file_path=` (Alias zu path_arg)
- Verarbeitet Chunks sowohl als `dict` **als auch** als Objekt (z.B. Dataclass `Chunk`)
- Schreibt v1-kompatible Felder:
* `id` (Alias von `chunk_id` **wichtig** für app/core/edges.py v1)
* `neighbors: {prev, next}` wird **berechnet** (Sequenz), falls nicht vorhanden
- Denormalisiert optional `tags` der Note auf Chunks
- Fügt Nummern-Aliase hinzu: `ord`, `chunk_num`, `Chunk_Nummer`
Wichtig:
- `edge_defaults` gehören zur *Note* (Typ-Regeln), nicht pro Chunk. Werden hier **nicht** gespiegelt.
"""
from __future__ import annotations
import json
import os
import pathlib
import hashlib
from typing import Any, Dict, List, Optional
from app.core.chunker import assemble_chunks
# ---------- Helpers ----------
def _as_dict(obj):
if isinstance(obj, dict):
return obj
# Objekt → (teilweise) Dict-Ansicht via Attribute
d = {}
for k in ("index","ord","chunk_index","text","window","id","chunk_id","neighbors","note_id","type","title"):
if hasattr(obj, k):
d[k] = getattr(obj, k)
# Fallback: bestehe nicht auf Vollständigkeit
return d
def _coalesce(*vals):
for v in vals:
if v is not None:
return v
return None
def _env_float(name: str, default: float) -> float:
try:
return float(os.environ.get(name, default))
except Exception:
return default
def _ensure_list(x) -> list:
if x is None: return []
if isinstance(x, list): return [str(i) for i in x]
if isinstance(x, (set, tuple)): return [str(i) for i in x]
return [str(x)]
def _text_from_note(note: Dict[str, Any]) -> str:
return note.get("body") or note.get("text") or ""
def _iter_chunks(note: Dict[str, Any], chunk_profile: str, fulltext: str) -> List[Dict[str, Any]]:
    """Chunk ``fulltext`` with the existing ``assemble_chunks(note_id, body, type)``.

    The chunker may return objects instead of dicts; every non-dict item is
    converted with ``_as_dict`` while dict items are passed through as-is.
    ``chunk_profile`` is accepted but not used here.
    """
    frontmatter = note.get("frontmatter") or {}
    note_id = note.get("id") or frontmatter.get("id")
    ntype = frontmatter.get("type") or note.get("type") or "note"
    produced = assemble_chunks(note_id, fulltext, ntype)
    # Normalise to dicts while keeping whatever keys an item already has.
    return [item if isinstance(item, dict) else _as_dict(item) for item in produced]
# ---------- Main ----------
def make_chunk_payloads(
    note: Any,
    path_arg: Optional[str] = None,
    chunks_from_chunker: Optional[List[Dict[str, Any]]] = None,
    *,
    file_path: Optional[str] = None,
    note_text: Optional[str] = None,
    types_cfg: Optional[dict] = None,
) -> List[Dict[str, Any]]:
    """Build chunk payloads in the v1-compatible format (plus V2 aliases).

    Args:
        note: Note as a dict (optionally with a ``frontmatter`` dict). Any
            non-dict value is treated as a note without metadata.
        path_arg: Lowest-priority source for the note path.
        chunks_from_chunker: Pre-computed chunks (dicts or objects). When this
            is not a list, the chunks are produced via ``_iter_chunks``.
        file_path: Highest-priority source for the note path
            (``file_path`` > ``note["path"]`` > ``path_arg``).
        note_text: Full text to chunk; when not a string, the text is taken
            from the note itself.
        types_cfg: Optional mapping ``note type -> settings`` supplying
            ``retriever_weight`` / ``chunk_profile`` defaults.

    Returns:
        One payload dict per chunk, in input order. Each payload carries
        ``id``/``chunk_id``, the index aliases (``index``, ``ord``,
        ``chunk_num``, ``Chunk_Nummer``), note metadata, ``text``/``window``,
        ``neighbors`` (``prev``/``next``) and, if the note has any, ``tags``.

    Raises:
        TypeError, ValueError: If a payload is not JSON-serialisable.
    """
    # ---- Note context ----
    n = note if isinstance(note, dict) else {"frontmatter": {}}
    fm = n.get("frontmatter")
    if not isinstance(fm, dict):
        # Missing or malformed frontmatter must not crash payload creation.
        fm = {}
    note_type = str(fm.get("type") or n.get("type") or "note")

    cfg_for_type = types_cfg.get(note_type) if isinstance(types_cfg, dict) else None
    if not isinstance(cfg_for_type, dict):
        # A type entry may be empty (e.g. ``None`` from YAML) or malformed.
        cfg_for_type = {}

    default_rw = _env_float("MINDNET_DEFAULT_RETRIEVER_WEIGHT", 1.0)
    retriever_weight = _coalesce(
        fm.get("retriever_weight"),
        cfg_for_type.get("retriever_weight"),
        default_rw,
    )
    try:
        retriever_weight = float(retriever_weight)
    except (TypeError, ValueError, OverflowError):
        retriever_weight = default_rw

    chunk_profile = _coalesce(
        fm.get("chunk_profile"),
        cfg_for_type.get("chunk_profile"),
        os.environ.get("MINDNET_DEFAULT_CHUNK_PROFILE", "medium"),
    )
    if not isinstance(chunk_profile, str):
        chunk_profile = "medium"

    note_id = n.get("note_id") or n.get("id") or fm.get("id")
    title = n.get("title") or fm.get("title") or ""

    # Path resolution: file_path > note['path'] > path_arg
    path = file_path or n.get("path") or path_arg
    if isinstance(path, os.PathLike):
        # Covers pathlib.Path as well as other path-like objects.
        path = os.fspath(path)
    path = path or ""

    # Denormalise the note's tags onto every chunk (optional)
    tags = fm.get("tags") or fm.get("keywords") or n.get("tags")
    tags_list = _ensure_list(tags) if tags else []

    # ---- Obtain chunks ----
    if isinstance(chunks_from_chunker, list):
        raw_chunks = chunks_from_chunker
    else:
        fulltext = note_text if isinstance(note_text, str) else _text_from_note(n)
        raw_chunks = _iter_chunks(n, chunk_profile, fulltext)

    payloads: List[Dict[str, Any]] = []
    for c in raw_chunks:
        cdict = c if isinstance(c, dict) else _as_dict(c)

        # Determine the index robustly; fall back to the sequence position.
        position = len(payloads)
        idx = _coalesce(
            cdict.get("index"), cdict.get("ord"), cdict.get("chunk_index"), position
        )
        try:
            idx = int(idx)
        except (TypeError, ValueError, OverflowError):
            idx = position

        # A window supplied by the chunker takes precedence over the plain text.
        text = _coalesce(cdict.get("window"), cdict.get("text"), "")
        if not isinstance(text, str):
            text = str(text or "")

        # Keep an id supplied by the chunk; otherwise derive a deterministic
        # one from note id (or path, if there is no note id) and index.
        chunk_id = cdict.get("chunk_id") or cdict.get("id")
        if not chunk_id:
            seed = f"{note_id}|{idx}" if note_id else f"{path}|{idx}"
            digest = hashlib.sha1(seed.encode("utf-8")).hexdigest()[:12]
            chunk_id = f"{note_id}-{idx:03d}-{digest}" if note_id else digest

        payload = {
            # v1 core fields (+ extensions)
            "id": chunk_id,  # IMPORTANT: v1 edges.py expects 'id'
            "chunk_id": chunk_id,  # v2 alias
            "index": idx,
            "ord": idx,  # v2 alias
            "chunk_num": idx,
            "Chunk_Nummer": idx,
            "note_id": note_id,
            "type": note_type,
            "title": title,
            "path": path,
            "text": text,
            "window": text,  # identical to 'text' (window content if the chunker provided one)
            "retriever_weight": retriever_weight,
            "chunk_profile": chunk_profile,
        }

        # Take over neighbors already present on the chunk
        nb = cdict.get("neighbors")
        if isinstance(nb, dict):
            payload["neighbors"] = {"prev": nb.get("prev"), "next": nb.get("next")}

        # Mirror tags
        if tags_list:
            payload["tags"] = tags_list

        # Validation: serialising raises if the payload is not JSON-compatible.
        json.dumps(payload, ensure_ascii=False)
        payloads.append(payload)

    # Afterwards: fill in missing neighbors from the sequence order
    last = len(payloads) - 1
    for i, p in enumerate(payloads):
        nb = p.get("neighbors") or {}
        prev_id = nb.get("prev")
        next_id = nb.get("next")
        if prev_id is None and i > 0:
            prev_id = payloads[i - 1]["id"]
        if next_id is None and i < last:
            next_id = payloads[i + 1]["id"]
        p["neighbors"] = {"prev": prev_id, "next": next_id}

    return payloads