mindnet/app/core/chunk_payload.py
Lars c34df96839
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 2s
app/core/chunk_payload.py aktualisiert
2025-09-09 11:15:19 +02:00

144 lines
5.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Modul: app/core/chunk_payload.py
Version: 1.1.1
Datum: 2025-09-09
Kurzbeschreibung
----------------
Erzeugt Qdrant-Payloads für Text-Chunks einer Note. Jeder Chunk enthält
den tatsächlichen Text unter dem Schlüssel ``text``. Dadurch kann der
Exporter den vollständigen Body verlässlich aus Chunks rekonstruieren,
falls ``notes.payload.fulltext`` fehlt.
Wesentliche Features
--------------------
- Stabile, idempotente Payload-Struktur für Chunks
- Persistenter Chunk-Text (``text``)
- Extraktion von Wikilinks pro Chunk (``wikilinks`` & ``references``)
- Pfadübernahme (relativ zum Vault, wird vom Aufrufer geliefert)
- Bereinigung leerer Felder (keine ``None``/leere Collections im Payload)
Abhängigkeiten
--------------
- ``app.core.chunker.Chunk`` (Felder: id, index, char_start, char_end,
token_count, section_title, section_path, neighbors_prev, neighbors_next, text)
- ``app.core.parser.extract_wikilinks``
Beispiele (CLI Sichtprüfung)
------------------------------
python3 -m app.core.chunk_payload --from-file ./vault/demo.md --print
"""
from __future__ import annotations
from typing import Dict, List, Any
import argparse
import json
import os
try:
# Projektinterne Imports
from app.core.chunker import Chunk, chunk_markdown
from app.core.parser import extract_wikilinks, read_markdown
except Exception: # pragma: no cover - Fallback für relative Ausführung
from .chunker import Chunk, chunk_markdown # type: ignore
from .parser import extract_wikilinks, read_markdown # type: ignore
# ---------------------------------------------------------------------------
# Utils
# ---------------------------------------------------------------------------
def _drop_empty(d: Dict[str, Any]) -> Dict[str, Any]:
"""Entfernt leere/None-Felder aus einem Dict (für saubere Payloads)."""
return {k: v for k, v in d.items() if v not in (None, [], {}, "")}
# ---------------------------------------------------------------------------
# Kernfunktion
# ---------------------------------------------------------------------------
def make_chunk_payloads(note_meta: Dict[str, Any], path: str, chunks: List[Chunk]) -> List[Dict[str, Any]]:
    """
    Build one payload dict per chunk of a note.

    Parameters
    ----------
    note_meta : Dict[str, Any]
        Note-level metadata. Only ``id``, ``title`` and ``lang`` are read
        here (via ``.get``, so missing keys yield ``None``).
    path : str
        Path of the note, stored unchanged under ``path``. The caller is
        expected to pass it relative to the vault root.
    chunks : List[Chunk]
        Pre-segmented chunks. Attributes are read with ``getattr`` and a
        ``None`` default, so a missing attribute does not raise.

    Returns
    -------
    List[Dict[str, Any]]
        One payload per chunk, in input order, intended for Qdrant upserts.
        Top-level entries that ``_drop_empty`` considers empty are omitted;
        ``external_links`` is always an empty list and therefore never
        appears in the result.
    """

    def _payload_for(chunk: Any) -> Dict[str, Any]:
        # The chunk text feeds both the wikilink extraction and the
        # persisted ``text`` field used for export/reconstruction.
        chunk_text = getattr(chunk, "text", None)
        links = extract_wikilinks(chunk_text or "")
        link_refs = [{"target_id": target, "kind": "wikilink"} for target in links]

        # This dict always has two keys, so it is never "empty" and the
        # ``neighbors`` entry survives the cleanup even when both are None.
        neighbor_ids = {
            "prev": getattr(chunk, "neighbors_prev", None),
            "next": getattr(chunk, "neighbors_next", None),
        }

        fields: Dict[str, Any] = {
            "id": getattr(chunk, "id", None),
            "note_id": note_meta.get("id"),
            "note_title": note_meta.get("title"),
            "chunk_index": getattr(chunk, "index", None),
            "char_start": getattr(chunk, "char_start", None),
            "char_end": getattr(chunk, "char_end", None),
            "token_count": getattr(chunk, "token_count", None),
            "section_title": getattr(chunk, "section_title", None),
            "section_path": getattr(chunk, "section_path", None),
            "lang": note_meta.get("lang"),
            "wikilinks": links,
            "external_links": [],
            "references": link_refs,
            "neighbors": neighbor_ids,
            "path": path,
            "text": chunk_text,
        }
        return _drop_empty(fields)

    return [_payload_for(chunk) for chunk in chunks]
# ---------------------------------------------------------------------------
# CLI nur zur Sichtprüfung / Debug
# ---------------------------------------------------------------------------
def _cli() -> None:
    """Preview CLI: build the chunk payloads for one Markdown file.

    Reads the file given by ``--from-file``, derives the note metadata from
    its frontmatter, chunks the body and builds the payloads. The payloads
    are written to stdout as indented JSON only when ``--print`` is passed;
    otherwise nothing is output.
    """
    parser = argparse.ArgumentParser(description="Vorschau: Chunk-Payloads erzeugen und anzeigen")
    parser.add_argument("--from-file", dest="src", required=True, help="Pfad zu einer Markdown-Datei")
    parser.add_argument("--print", dest="do_print", action="store_true", help="Payload auf stdout ausgeben")
    opts = parser.parse_args()
    source_path = opts.src

    parsed = read_markdown(source_path)
    front = parsed.frontmatter

    # ``id`` falls back to ``note_id``; ``tags`` defaults to an empty list.
    meta: Dict[str, Any] = {"id": front.get("id") or front.get("note_id")}
    for key in ("title", "type", "area", "project"):
        meta[key] = front.get(key)
    meta["tags"] = front.get("tags", [])
    meta["lang"] = front.get("lang")

    pieces = chunk_markdown(parsed.body or "")

    # Demo-only heuristic: treat the grandparent directory of the file as
    # the vault root and express the path relative to it, with "/" separators.
    guessed_root = os.path.dirname(os.path.dirname(source_path))
    relative = os.path.relpath(source_path, guessed_root)
    relative = relative.replace("\\", "/").lstrip("/")

    result = make_chunk_payloads(meta, relative, pieces)
    if not opts.do_print:
        return
    print(json.dumps(result, ensure_ascii=False, indent=2))
# Script entry point: run the preview CLI only when this module is executed
# directly (e.g. via ``python3 -m app.core.chunk_payload``), not on import.
if __name__ == "__main__": # pragma: no cover
    _cli()