mindnet/app/core/chunk_payload.py
Lars 2a1c62aeed
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
Dateien nach "app/core" hochladen
2025-11-11 17:01:19 +01:00

153 lines
5.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
app/core/chunk_payload.py — Mindnet V2 (compat)
Goals:
- Preserves existing behavior (index, chunk_profile, retriever_weight, etc.)
- Optionally denormalizes `tags` from the note frontmatter onto chunks
- Adds aliases for the chunk number: `ord` (v2 schema), `chunk_num`, `Chunk_Nummer`
- **Compatibility:** accepts both `path_arg` (positional) and `file_path` (keyword)
Note:
- `edge_defaults` are note-level rules (per type) and are not mirrored onto each chunk.
"""
from __future__ import annotations
import json
import os
import pathlib
import hashlib
from typing import Any, Dict, List, Optional
from app.core.chunker import assemble_chunks
def _as_dict(obj):
if isinstance(obj, dict): return obj
try:
return dict(obj) # type: ignore
except Exception:
return {"value": obj}
def _coalesce(*vals):
for v in vals:
if v is not None:
return v
return None
def _env_float(name: str, default: float) -> float:
try:
return float(os.environ.get(name, default))
except Exception:
return default
def _ensure_list(x) -> list:
if x is None: return []
if isinstance(x, list): return [str(i) for i in x]
if isinstance(x, (set, tuple)): return [str(i) for i in x]
return [str(x)]
def _load_types_config(types_cfg_explicit: Optional[dict] = None) -> dict:
"""Types-Registry *optional* einspeisen (bereits geparst), sonst lazy-laden vermeiden."""
return types_cfg_explicit or {}
def _text_from_note(note: Dict[str, Any]) -> str:
# Erwartete Inputs (siehe parser.py / import_markdown.py):
# note["body"] oder note["text"]; Fallback leerer String
return note.get("body") or note.get("text") or ""
def _iter_chunks(note: Dict[str, Any], chunk_profile: str, fulltext: str) -> List[Dict[str, Any]]:
    """Chunk *fulltext* by delegating to ``assemble_chunks(note_id, body, type)``.

    The note id is taken from ``note["id"]`` or, failing that, the
    frontmatter ``id``. The note type is taken from the frontmatter ``type``,
    then ``note["type"]``, defaulting to ``"note"``.

    ``chunk_profile`` is accepted but not forwarded to the chunker.
    """
    frontmatter = note.get("frontmatter") or {}
    note_type = frontmatter.get("type") or note.get("type") or "note"
    note_id = note.get("id") or frontmatter.get("id")
    # Per the original author's note, assemble_chunks is expected to return a
    # list of dicts carrying at least {"index", "text"} (v1) — confirm
    # against app.core.chunker.
    return assemble_chunks(note_id, fulltext, note_type)
def make_chunk_payloads(
    note: Any,
    path_arg: Optional[str] = None,
    chunks_from_chunker: Optional[List[Dict[str, Any]]] = None,
    *,
    file_path: Optional[str] = None,
    note_text: Optional[str] = None,
    types_cfg: Optional[dict] = None,
) -> List[Dict[str, Any]]:
    """Build one payload dict per chunk of a note.

    Args:
        note: Normalized note structure (including ``frontmatter``); coerced
            to a dict via ``_as_dict``.
        path_arg: Path of the note (positional form, lowest priority).
        chunks_from_chunker: Optional pre-computed chunk list. When it is not
            a list, chunks are produced internally via ``_iter_chunks``.
        file_path: Path of the note (keyword form, highest priority).
        note_text: Optional full text; used instead of the note's own
            ``body``/``text`` when it is a string.
        types_cfg: Optional, already-parsed types registry keyed by note type.

    Returns:
        A list of payload dicts, each containing ``note_id``, ``chunk_id``,
        ``index`` plus its aliases ``ord`` / ``chunk_num`` / ``Chunk_Nummer``,
        ``title``, ``type``, ``path``, ``text``, ``retriever_weight`` and
        ``chunk_profile``; ``tags`` is added when the note carries any.

    Raises:
        TypeError, ValueError: propagated from ``json.dumps`` when a payload
            is not JSON-serializable.
    """
    n = _as_dict(note)
    fm = n.get("frontmatter") or {}
    note_type = str(fm.get("type") or n.get("type") or "note")

    types_cfg = _load_types_config(types_cfg)
    cfg_for_type = types_cfg.get(note_type) if isinstance(types_cfg, dict) else None
    if not isinstance(cfg_for_type, dict):
        # Covers unknown types as well as registry entries that are null or
        # otherwise not a mapping; previously the latter crashed on `.get`.
        cfg_for_type = {}

    # Retriever weight: frontmatter > per-type config > environment default.
    default_rw = _env_float("MINDNET_DEFAULT_RETRIEVER_WEIGHT", 1.0)
    retriever_weight = _coalesce(
        fm.get("retriever_weight"),
        cfg_for_type.get("retriever_weight"),
        default_rw,
    )
    try:
        retriever_weight = float(retriever_weight)
    except (TypeError, ValueError):
        retriever_weight = default_rw

    # Chunk profile: same precedence; non-string values fall back to "medium".
    chunk_profile = _coalesce(
        fm.get("chunk_profile"),
        cfg_for_type.get("chunk_profile"),
        os.environ.get("MINDNET_DEFAULT_CHUNK_PROFILE", "medium"),
    )
    if not isinstance(chunk_profile, str):
        chunk_profile = "medium"

    note_id = n.get("note_id") or n.get("id") or fm.get("id")
    title = n.get("title") or fm.get("title") or ""

    # Path resolution priority: file_path > note["path"] > path_arg.
    path = file_path or n.get("path") or path_arg
    if isinstance(path, os.PathLike):
        # Covers pathlib.Path as well as any other path-like object.
        path = os.fspath(path)
    path = path or ""  # always present in the payload, possibly empty

    # Denormalized tags (optional): mirrored onto every chunk when present.
    tags = fm.get("tags") or fm.get("keywords") or n.get("tags")
    tags_list = _ensure_list(tags) if tags else []

    # Source text and chunks.
    fulltext = note_text if isinstance(note_text, str) else _text_from_note(n)
    if isinstance(chunks_from_chunker, list):
        chunks = chunks_from_chunker
    else:
        chunks = _iter_chunks(n, chunk_profile, fulltext)

    payloads: List[Dict[str, Any]] = []
    for c in chunks:
        is_mapping = isinstance(c, dict)

        # Index: use the chunk's own "index" when it is usable as an integer,
        # otherwise fall back to the running position. Non-dict chunks have
        # no index and always use the position.
        raw_idx = c.get("index") if is_mapping else None
        try:
            idx = int(raw_idx)
        except (TypeError, ValueError, OverflowError):
            idx = len(payloads)

        if is_mapping:
            text = c.get("text")
        else:
            text = str(c) if c is not None else ""
        if not isinstance(text, str):
            text = str(text or "")

        # Deterministic ID (existing scheme kept): hash of "<note_id>|<idx>",
        # or of "<path>|<idx>" when the note has no id. SHA-1 serves only as
        # a stable fingerprint here, not as a security measure.
        hash_key = f"{note_id}|{idx}" if note_id else f"{path}|{idx}"
        h = hashlib.sha1(hash_key.encode("utf-8")).hexdigest()[:12]
        chunk_id = f"{note_id}-{idx:03d}-{h}" if note_id else h

        payload = {
            "note_id": note_id,
            "chunk_id": chunk_id,
            "index": idx,
            "ord": idx,           # alias for the v2 schema
            "chunk_num": idx,     # neutral alias
            "Chunk_Nummer": idx,  # German-language alias
            "title": title,
            "type": note_type,
            "path": path,         # always present
            "text": text,         # always a str (may be empty)
            "retriever_weight": retriever_weight,
            "chunk_profile": chunk_profile,
        }
        if tags_list:
            # Each payload gets its own copy so that mutating one chunk's
            # tags downstream cannot affect the others.
            payload["tags"] = list(tags_list)

        # Serialization check: json.dumps raises if the payload cannot be
        # encoded. Parsing the result back adds nothing, so it is skipped.
        json.dumps(payload, ensure_ascii=False)
        payloads.append(payload)
    return payloads