mindnet/app/core/chunk_payload.py
Lars ea211a5c0b
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 2s
Dateien nach "app/core" hochladen
2025-11-11 17:08:34 +01:00

183 lines
6.7 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
app/core/chunk_payload.py — Mindnet V2 (compat)

Goals (unchanged v1 basis, fewer duplicates):
- **Canonical field:** `index`
- **Standard alias (v2):** `ord` (can be disabled via ENV MINDNET_CHUNK_INCLUDE_ORD=0)
- **Optional aliases:** controlled via ENV MINDNET_CHUNK_INDEX_ALIASES
  (e.g. "chunk_num,Chunk_Nummer" or "Chunk_Number"). Default: no additional alias.
- Processes chunks given as a dict **or** as an object (dataclass) and always sets `id` (= `chunk_id`)
- Computes `neighbors.prev/next` if not present
- Denormalizes the note's `tags` onto the chunks
- Accepts `file_path=` as an alias for `path_arg`

ENV:
- MINDNET_CHUNK_INCLUDE_ORD: "1" (default) | "0"
- MINDNET_CHUNK_INDEX_ALIASES: CSV list of permitted names: chunk_num,Chunk_Nummer,Chunk_Number

Note: `edge_defaults` are note-level rules (not per chunk).
"""
from __future__ import annotations
import json
import os
import pathlib
import hashlib
from typing import Any, Dict, List, Optional
from app.core.chunker import assemble_chunks
# ---------- Helpers ----------
def _as_dict(obj):
if isinstance(obj, dict):
return obj
d = {}
for k in ("index","ord","chunk_index","text","window","id","chunk_id","neighbors","note_id","type","title"):
if hasattr(obj, k):
d[k] = getattr(obj, k)
return d
def _coalesce(*vals):
for v in vals:
if v is not None:
return v
return None
def _env_float(name: str, default: float) -> float:
try:
return float(os.environ.get(name, default))
except Exception:
return default
def _ensure_list(x) -> list:
if x is None: return []
if isinstance(x, list): return [str(i) for i in x]
if isinstance(x, (set, tuple)): return [str(i) for i in x]
return [str(x)]
def _text_from_note(note: Dict[str, Any]) -> str:
return note.get("body") or note.get("text") or ""
def _iter_chunks(note: Dict[str, Any], chunk_profile: str, fulltext: str) -> List[Dict[str, Any]]:
    """Chunk *fulltext* with ``assemble_chunks(note_id, body, type)`` and return dicts.

    The note id is taken from ``note["id"]`` or, failing that, from the
    frontmatter; the note type from the frontmatter, then ``note["type"]``,
    then ``"note"``. Whatever ``assemble_chunks`` yields is passed through
    ``_as_dict``, so dict items are kept as they are and other objects are
    converted. *chunk_profile* is accepted but not used by this function.
    """
    frontmatter = note.get("frontmatter") or {}
    note_id = note.get("id") or frontmatter.get("id")
    note_type = frontmatter.get("type") or note.get("type") or "note"
    return [_as_dict(chunk) for chunk in assemble_chunks(note_id, fulltext, note_type)]
# ---------- Main ----------
def make_chunk_payloads(
    note: Any,
    path_arg: Optional[str] = None,
    chunks_from_chunker: Optional[List[Dict[str, Any]]] = None,
    *,
    file_path: Optional[str] = None,
    note_text: Optional[str] = None,
    types_cfg: Optional[dict] = None,
) -> List[Dict[str, Any]]:
    """Build one payload dict per chunk of *note*.

    Args:
        note: The note as a dict. The keys ``frontmatter``, ``type``,
            ``note_id``, ``id``, ``title``, ``path`` and ``tags`` are read
            here. Anything that is not a dict is treated as an empty note.
        path_arg: Path used only when neither *file_path* nor
            ``note["path"]`` is set.
        chunks_from_chunker: Pre-computed chunks (dicts or objects). If this
            is not a list, the chunks are produced via ``_iter_chunks``.
        file_path: Path with the highest priority
            (``file_path`` > ``note["path"]`` > ``path_arg``).
        note_text: Text to chunk when it is a ``str``; otherwise the text is
            taken from the note via ``_text_from_note``.
        types_cfg: Mapping of note type to settings; ``retriever_weight``
            and ``chunk_profile`` are read from the entry of the note's type.
            An entry that offers no ``.get`` (e.g. ``None``) is treated as
            empty settings.

    Environment variables read:
        ``MINDNET_DEFAULT_RETRIEVER_WEIGHT``, ``MINDNET_DEFAULT_CHUNK_PROFILE``,
        ``MINDNET_CHUNK_INCLUDE_ORD``, ``MINDNET_CHUNK_INDEX_ALIASES``.

    Returns:
        A list of payload dicts with the keys ``id``, ``chunk_id``,
        ``index``, ``note_id``, ``type``, ``title``, ``path``, ``text``,
        ``window``, ``retriever_weight``, ``chunk_profile`` and
        ``neighbors``; plus ``ord``, whitelisted index aliases and ``tags``
        when enabled / available.

    Raises:
        TypeError, ValueError: From ``json.dumps`` when a payload is not
            JSON-serializable.
    """
    n = note if isinstance(note, dict) else {"frontmatter": {}}
    fm = n.get("frontmatter") or {}
    note_type = str(fm.get("type") or n.get("type") or "note")

    types_cfg = types_cfg or {}
    cfg_for_type = types_cfg.get(note_type, {}) if isinstance(types_cfg, dict) else {}
    if not hasattr(cfg_for_type, "get"):
        # e.g. an empty YAML entry loads as None; treat it as "no settings"
        # instead of failing on the .get() calls below.
        cfg_for_type = {}

    # Retriever weight: frontmatter > type config > environment default.
    default_rw = _env_float("MINDNET_DEFAULT_RETRIEVER_WEIGHT", 1.0)
    retriever_weight = _coalesce(
        fm.get("retriever_weight"),
        cfg_for_type.get("retriever_weight"),
        default_rw,
    )
    try:
        retriever_weight = float(retriever_weight)
    except (TypeError, ValueError, OverflowError):
        retriever_weight = default_rw

    # Chunk profile: frontmatter > type config > environment default.
    chunk_profile = _coalesce(
        fm.get("chunk_profile"),
        cfg_for_type.get("chunk_profile"),
        os.environ.get("MINDNET_DEFAULT_CHUNK_PROFILE", "medium"),
    )
    if not isinstance(chunk_profile, str):
        chunk_profile = "medium"

    note_id = n.get("note_id") or n.get("id") or fm.get("id")
    title = n.get("title") or fm.get("title") or ""

    # Path (file_path > note['path'] > path_arg)
    path = file_path or n.get("path") or path_arg
    if isinstance(path, pathlib.Path):
        path = str(path)
    path = path or ""

    # Denormalize tags (optional)
    tags = fm.get("tags") or fm.get("keywords") or n.get("tags")
    tags_list = _ensure_list(tags) if tags else []

    # Fetch chunks
    fulltext = note_text if isinstance(note_text, str) else _text_from_note(n)
    if isinstance(chunks_from_chunker, list):
        raw_chunks = chunks_from_chunker
    else:
        raw_chunks = _iter_chunks(n, chunk_profile, fulltext)

    include_ord = os.environ.get("MINDNET_CHUNK_INCLUDE_ORD", "1") != "0"

    # Only whitelisted alias names are allowed; filter once, not per chunk.
    allowed_aliases = ("chunk_num", "Chunk_Nummer", "Chunk_Number")
    alias_csv = os.environ.get("MINDNET_CHUNK_INDEX_ALIASES", "").strip()
    index_aliases = [
        alias
        for alias in (part.strip() for part in alias_csv.split(","))
        if alias in allowed_aliases
    ]

    payloads: List[Dict[str, Any]] = []
    for c in raw_chunks:
        cdict = c if isinstance(c, dict) else _as_dict(c)

        # Index: explicit value from the chunk, else its position in the output.
        idx = _coalesce(
            cdict.get("index"),
            cdict.get("ord"),
            cdict.get("chunk_index"),
            len(payloads),
        )
        try:
            idx = int(idx)
        except (TypeError, ValueError, OverflowError):
            idx = len(payloads)

        # Both "text" and "window" carry the same value: window if set, else text.
        text = _coalesce(cdict.get("window"), cdict.get("text"), "")
        if not isinstance(text, str):
            text = str(text or "")

        # Deterministic ID: derived from the note id, or from the path when
        # the note has no id.
        if note_id:
            h = hashlib.sha1(f"{note_id}|{idx}".encode("utf-8")).hexdigest()[:12]
            generated_id = f"{note_id}-{idx:03d}-{h}"
        else:
            generated_id = hashlib.sha1(f"{path}|{idx}".encode("utf-8")).hexdigest()[:12]
        chunk_id = cdict.get("chunk_id") or cdict.get("id") or generated_id

        payload = {
            "id": chunk_id,  # v1 expects 'id'
            "chunk_id": chunk_id,
            "index": idx,  # canonical
            "note_id": note_id,
            "type": note_type,
            "title": title,
            "path": path,
            "text": text,
            "window": text,
            "retriever_weight": retriever_weight,
            "chunk_profile": chunk_profile,
        }
        if include_ord:
            payload["ord"] = idx  # v2 standard, can be switched off
        for alias in index_aliases:
            payload[alias] = idx

        nb = cdict.get("neighbors")
        if isinstance(nb, dict):
            payload["neighbors"] = {"prev": nb.get("prev"), "next": nb.get("next")}

        if tags_list:
            # Give every payload its own list so that mutating one payload's
            # tags does not affect the others.
            payload["tags"] = list(tags_list)

        # Round trip whose result is discarded: it raises here if the payload
        # is not JSON-serializable.
        json.loads(json.dumps(payload, ensure_ascii=False))
        payloads.append(payload)

    # Fill in missing neighbors from the order of the payload list.
    last = len(payloads) - 1
    for i, p in enumerate(payloads):
        nb = p.get("neighbors") or {}
        prev_id = nb.get("prev")
        next_id = nb.get("next")
        if prev_id is None and i > 0:
            prev_id = payloads[i - 1]["id"]
        if next_id is None and i < last:
            next_id = payloads[i + 1]["id"]
        p["neighbors"] = {"prev": prev_id, "next": next_id}

    return payloads