diff --git a/app/core/chunk_payload.py b/app/core/chunk_payload.py index dec7c2b..014f144 100644 --- a/app/core/chunk_payload.py +++ b/app/core/chunk_payload.py @@ -1,37 +1,150 @@ -from __future__ import annotations -from typing import Dict, List -from .chunker import Chunk -from .parser import extract_wikilinks +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Modul: app/core/chunk_payload.py +Version: 1.1.0 +Datum: 2025-09-09 -def make_chunk_payloads(note_meta: Dict, path: str, chunks: List[Chunk]) -> List[Dict]: - res = [] +Kurzbeschreibung +---------------- +Erzeugt Qdrant-Payloads für Text-Chunks einer Note. Jeder Chunk enthält +den tatsächlichen Text unter dem Schlüssel ``text``. Dadurch kann der +Exporter den vollständigen Body verlässlich aus Chunks rekonstruieren, +falls ``notes.payload.fulltext`` fehlt. + +Wesentliche Features +-------------------- +- Stabile, idempotente Payload-Struktur für Chunks +- Persistenter Chunk-Text (``text``) +- Extraktion von Wikilinks pro Chunk (``wikilinks`` & ``references``) +- Pfadübernahme (relativ zum Vault, wird vom Aufrufer geliefert) +- Bereinigung leerer Felder (keine ``None``/leere Collections im Payload) + +Abhängigkeiten +-------------- +- ``app.core.chunker.Chunk`` (Felder: id, index, char_start, char_end, + token_count, section_title, section_path, neighbors_prev, neighbors_next, text) +- ``app.core.parser.extract_wikilinks`` + +Beispiele (CLI – Sichtprüfung) +------------------------------ + python3 -m app.core.chunk_payload --from-file ./vault/demo.md --print + +Hinweis +------- +Die CLI ist ein Hilfsmittel zur Sichtprüfung. Im Produktpfad ruft der +Importer ``make_chunk_payloads(...)`` direkt auf. +""" + +from __future__ import annotations +from typing import Dict, List, Any +import argparse +import json +import os + +try: + # Projektinterne Imports + from app.core.chunker import Chunk + from app.core.parser import extract_wikilinks, parse_markdown +except Exception: # pragma: no cover - Fallback für relative Ausführung + from .chunker import Chunk # type: ignore + from .parser import extract_wikilinks, parse_markdown # type: ignore + + +# --------------------------------------------------------------------------- +# Utils +# --------------------------------------------------------------------------- + +def _drop_empty(d: Dict[str, Any]) -> Dict[str, Any]: + """Entfernt leere/None-Felder aus einem Dict (für saubere Payloads).""" + return {k: v for k, v in d.items() if v not in (None, [], {}, "")} + + +# --------------------------------------------------------------------------- +# Kernfunktion +# --------------------------------------------------------------------------- + +def make_chunk_payloads(note_meta: Dict[str, Any], path: str, chunks: List[Chunk]) -> List[Dict[str, Any]]: + """ + Baut Payloads für alle ``chunks`` der gegebenen Note. + + Parameters + ---------- + note_meta : Dict[str, Any] + Minimale Metadaten der Note (mind. ``id``, ``title``; optional ``type``, + ``area``, ``project``, ``tags``, ``lang``). + path : str + Relativer Pfad der Note innerhalb des Vaults (z. B. "area/topic/file.md"). + chunks : List[Chunk] + Liste vorsegmentierter Chunks (vgl. app.core.chunker.Chunk). + + Returns + ------- + List[Dict[str, Any]] + Payload-Objekte, bereit für Qdrant-Upserts. + """ + res: List[Dict[str, Any]] = [] for ch in chunks: - wikilinks = extract_wikilinks(ch.text) + wikilinks = extract_wikilinks(getattr(ch, "text", "") or "") payload = { - "id": ch.id, - "note_id": note_meta["id"], - "note_title": note_meta["title"], - "chunk_index": ch.index, - "char_start": ch.char_start, - "char_end": ch.char_end, - "token_count": ch.token_count, - "type": note_meta.get("type"), - "area": note_meta.get("area"), - "project": note_meta.get("project"), - "tags": note_meta.get("tags", []), - "section_title": ch.section_title, - "section_path": ch.section_path, + "id": getattr(ch, "id", None), + "note_id": note_meta.get("id"), + "note_title": note_meta.get("title"), + "chunk_index": getattr(ch, "index", None), + "char_start": getattr(ch, "char_start", None), + "char_end": getattr(ch, "char_end", None), + "token_count": getattr(ch, "token_count", None), + "section_title": getattr(ch, "section_title", None), + "section_path": getattr(ch, "section_path", None), "lang": note_meta.get("lang"), "wikilinks": wikilinks, - "external_links": [], # (optional später ergänzen) + "external_links": [], # kann später ergänzt werden "references": [{"target_id": w, "kind": "wikilink"} for w in wikilinks], "neighbors": { - "prev": ch.neighbors_prev, - "next": ch.neighbors_next + "prev": getattr(ch, "neighbors_prev", None), + "next": getattr(ch, "neighbors_next", None), }, - "path": path, + "path": path, # vom Aufrufer relativ geliefert + "text": getattr(ch, "text", None), # WICHTIG für Export/Rekonstruktion } - # None/Leere bereinigen - payload = {k: v for k, v in payload.items() if v not in (None, [], {})} - res.append(payload) + res.append(_drop_empty(payload)) return res + + +# --------------------------------------------------------------------------- +# CLI – nur zur Sichtprüfung / Debug +# --------------------------------------------------------------------------- + +def _cli() -> None: + ap = argparse.ArgumentParser(description="Vorschau: Chunk-Payloads erzeugen und anzeigen") + ap.add_argument("--from-file", dest="src", required=True, help="Pfad zu einer Markdown-Datei") + ap.add_argument("--print", dest="do_print", action="store_true", help="Payload auf stdout ausgeben") + args = ap.parse_args() + + note = parse_markdown(args.src) + note_meta = { + "id": note.frontmatter.get("id") or note.frontmatter.get("note_id"), + "title": note.frontmatter.get("title"), + "type": note.frontmatter.get("type"), + "area": note.frontmatter.get("area"), + "project": note.frontmatter.get("project"), + "tags": note.frontmatter.get("tags", []), + "lang": note.frontmatter.get("lang"), + } + + # Chunking (benötigt app.core.chunker) + from app.core.chunker import chunk_markdown # lazy import + chunks = chunk_markdown(note.body or "") + + # Vault-Root heuristisch relativieren (nur für Demo) + vault_root = os.path.dirname(os.path.dirname(args.src)) # heuristisch + rel_path = os.path.relpath(args.src, vault_root).replace("\\", "/").lstrip("/") + + payloads = make_chunk_payloads(note_meta, rel_path, chunks) + + if args.do_print: + print(json.dumps(payloads, ensure_ascii=False, indent=2)) + + +if __name__ == "__main__": # pragma: no cover + _cli()