#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Module: app/core/chunk_payload.py
Version: 2.0.0
Date: 2025-09-09

Summary
-------
Builds **chunk payloads** for the Qdrant collection `_chunks` from the
`Chunk` objects delivered by the chunker. Goals:

- *Lossless reconstruction*: every chunk carries its **text** (payload["text"]).
- *Fast queries*: important note metadata is mirrored so filters need no join.
- *Graph compatibility*: wikilinks and external links are extracted;
  neighbor references are passed through.
- *Monitoring*: token/length metrics and a text hash ease audits and
  re-embeddings.

Compatibility
-------------
- **Backward compatible** with the previous `make_chunk_payloads` signature.
- Additional fields do not disturb existing upserts (payload is schema-flexible).
- Expects `Chunk` to provide, among others, the attributes `id`, `index`,
  `text`, `char_start`, `char_end`, `section_title`, `section_path`,
  `neighbors_prev`, `neighbors_next`.

CLI (quick test)
----------------
# Preview from a Markdown file
python3 -m app.core.chunk_payload --from-file ./test_vault/20_experiences/exp-two.md --vault-root ./test_vault

# Only IDs & token counts
python3 -m app.core.chunk_payload --from-file ./test_vault/20_experiences/exp-two.md --vault-root ./test_vault --summary

Fields (excerpt)
----------------
id              : chunk id as provided by the chunker ("...#cNN")
scope           : "chunk"
note_id         : str
note_title      : str
note_type       : str
note_status     : str
area, project   : optional
tags            : list[str]
note_path       : str (relative, slashes normalized)
chunk_index     : int
section_title   : str | None
section_path    : str | None
char_start      : int | None
char_end        : int | None
char_len        : int
token_est       : int (~ len(text)/4)
neighbors       : {"prev": str|None, "next": str|None}
text            : str (chunk text, **mandatory**)
text_sha256     : str, "sha256:" followed by the hex digest
lang            : optional
wikilinks       : list[str]
external_links  : list[{"href": str, "label": str|None}]
references      : list[{"target_id": str, "kind": "wikilink"}]
embed_model     : optional (may be passed through)
embed_dim       : optional
embed_version   : optional
"""

from __future__ import annotations

import argparse
import hashlib
import json
import os
import re
from typing import Dict, List, Optional, Tuple

try:
    # Package import (normal operation)
    from app.core.chunker import Chunk
    from app.core.parser import extract_wikilinks, read_markdown, normalize_frontmatter, validate_required_frontmatter
except ImportError:  # pragma: no cover
    # Relative import (local tests)
    from .chunker import Chunk  # type: ignore
    from .parser import extract_wikilinks, read_markdown, normalize_frontmatter, validate_required_frontmatter  # type: ignore

# ---------------------------------------------------------------------------
# Utilities
# ---------------------------------------------------------------------------

# Markdown inline link: [label](href "optional title")
RE_MD_LINK = re.compile(r"\[([^\]]*)\]\(([^)\s]+)(?:\s+\"([^\"]+)\")?\)")
# URL prefixes that qualify a Markdown link as "external" (all lowercase).
RE_HTTP_SCHEMES = ("http://", "https://", "mailto:", "obsidian://", "tel:")

# Payload keys that are kept even when their value is None/empty.
_ALWAYS_KEEP_KEYS = ("text", "neighbors")


def _estimate_tokens(text: str) -> int:
    """Return a rough token estimate (about 1 token per 4 characters).

    The text is stripped of surrounding whitespace first; ``None``/empty
    input yields 0.
    """
    return round(len((text or "").strip()) / 4)


def _sha256_text(text: str) -> str:
    """Return ``"sha256:"`` plus the hex SHA-256 digest of the UTF-8 encoded text."""
    digest = hashlib.sha256((text or "").encode("utf-8")).hexdigest()
    return "sha256:" + digest


def _normalize_rel_path(path: Optional[str], vault_root: Optional[str]) -> Optional[str]:
    """Normalize a note path to forward slashes, relative to the vault if possible.

    Parameters
    ----------
    path : Optional[str]
        Path to the note; falsy input returns ``None``.
    vault_root : Optional[str]
        Vault root used to relativize ``path``. When ``None`` only slash
        normalization and stripping of leading slashes is applied.

    Returns
    -------
    Optional[str]
        The normalized path, or ``None`` if ``path`` is empty.
    """
    if not path:
        return None
    p = str(path).replace("\\", "/")
    if vault_root:
        if os.path.isabs(p):
            try:
                p = os.path.relpath(p, vault_root)
            except ValueError:
                # relpath cannot relate the two paths (e.g. different
                # drives on Windows); keep the path as given.
                pass
        else:
            # A relative path (as in the documented CLI call) is resolved
            # against the working directory; it is only rewritten when it
            # actually lies inside the vault, so paths that are already
            # vault-relative stay untouched.
            root_abs = os.path.abspath(vault_root)
            cand_abs = os.path.abspath(p)
            try:
                common = os.path.commonpath([root_abs, cand_abs])
            except ValueError:
                common = None
            if common is not None and os.path.normcase(common) == os.path.normcase(root_abs):
                p = os.path.relpath(cand_abs, root_abs)
    return p.replace("\\", "/").lstrip("/")


def _extract_external_links(text: str) -> List[Dict[str, Optional[str]]]:
    """Find Markdown links ``[label](href "title")`` whose href has an allowed scheme.

    The scheme comparison is case-insensitive. Each result has the keys
    ``href`` and ``label``; when the label is empty the link title is used
    instead (``None`` if both are empty).
    """
    out: List[Dict[str, Optional[str]]] = []
    if not text:
        return out
    for m in RE_MD_LINK.finditer(text):
        label = (m.group(1) or "").strip() or None
        href = (m.group(2) or "").strip()
        title = (m.group(3) or "").strip() or None
        # URI schemes are case-insensitive, so compare on the lowered href.
        if href.lower().startswith(RE_HTTP_SCHEMES):
            out.append({"href": href, "label": label or title})
    return out


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------

def make_chunk_payloads(note_meta: Dict, path: str, chunks: List[Chunk]) -> List[Dict]:
    """
    Build payload dicts for all chunks of a note.

    Parameters
    ----------
    note_meta : Dict
        Normalized frontmatter of the note (at least: id, title, type,
        status, tags, [area, project, lang]).
    path : str
        Path to the note (should be relative to the vault; slashes are
        normalized here as a safeguard).
    chunks : List[Chunk]
        Chunks produced by the chunker.

    Returns
    -------
    List[Dict]
        List of payloads (one entry per chunk). Keys whose value is
        ``None``, ``[]`` or ``{}`` are omitted, except ``text`` and
        ``neighbors`` which are always present.
    """
    res: List[Dict] = []
    rel_path = _normalize_rel_path(path, vault_root=None)

    for ch in chunks:
        text: str = getattr(ch, "text", "") or ""
        wikilinks = extract_wikilinks(text)
        ext_links = _extract_external_links(text)

        # Prefer the span given by the chunk offsets; fall back to the text
        # length when the offsets are missing or yield a non-positive span.
        span_start = int(getattr(ch, "char_start", 0) or 0)
        span_end = int(getattr(ch, "char_end", 0) or 0)
        char_len = max(0, span_end - span_start) or len(text)

        payload: Dict = {
            "id": getattr(ch, "id", None),
            "scope": "chunk",
            "note_id": note_meta.get("id"),
            "note_title": note_meta.get("title"),
            # mirrored for fast filtering:
            "note_type": note_meta.get("type"),
            "note_status": note_meta.get("status"),
            "area": note_meta.get("area"),
            "project": note_meta.get("project"),
            "tags": note_meta.get("tags"),
            # path
            "note_path": rel_path,
            "path": rel_path,  # back-compat
            # order & section
            "chunk_index": getattr(ch, "index", None),
            "section_title": getattr(ch, "section_title", None),
            "section_path": getattr(ch, "section_path", None),
            # position
            "char_start": getattr(ch, "char_start", None),
            "char_end": getattr(ch, "char_end", None),
            "char_len": char_len,
            # neighbors
            "neighbors": {
                "prev": getattr(ch, "neighbors_prev", None),
                "next": getattr(ch, "neighbors_next", None),
            },
            # content & metrics
            "text": text,
            "text_sha256": _sha256_text(text),
            "token_est": _estimate_tokens(text),
            # language
            "lang": note_meta.get("lang"),
            # links
            "wikilinks": wikilinks,
            "external_links": ext_links,
            "references": [{"target_id": w, "kind": "wikilink"} for w in wikilinks],
        }

        # Drop None/empty values, but always keep **text** (it may be empty,
        # e.g. for image-only chunks) and **neighbors**.
        cleaned = {
            k: v
            for k, v in payload.items()
            if k in _ALWAYS_KEEP_KEYS or v not in (None, [], {})
        }
        res.append(cleaned)

    return res


# ---------------------------------------------------------------------------
# CLI for quick testing
# ---------------------------------------------------------------------------

def _cli() -> None:
    """Parse CLI arguments, build payloads for one Markdown file and print them as JSON.

    With ``--summary`` only a short per-chunk overview (id, index, token
    estimate, links, neighbors) is printed instead of the full payloads.
    """
    ap = argparse.ArgumentParser(description="Chunk-Payloads aus einer einzelnen Markdown-Datei erzeugen")
    ap.add_argument("--from-file", required=True, help="Pfad zur Markdown-Datei")
    ap.add_argument("--vault-root", default=None, help="Vault-Wurzel (zur Pfad-Relativierung)")
    ap.add_argument("--summary", action="store_true", help="Nur kurze Übersicht je Chunk ausgeben")
    args = ap.parse_args()

    parsed = read_markdown(args.from_file)
    fm = normalize_frontmatter(parsed.frontmatter)
    validate_required_frontmatter(fm)

    # lazy import to avoid import cycles
    try:
        from app.core.chunker import assemble_chunks
    except ImportError:
        from .chunker import assemble_chunks  # type: ignore

    chunks = assemble_chunks(fm["id"], parsed.body, fm.get("type", "concept"))
    rel = _normalize_rel_path(parsed.path, args.vault_root)

    pls = make_chunk_payloads(fm, rel or parsed.path, chunks)

    if args.summary:
        out = [
            {
                "id": p.get("id"),
                "chunk_index": p.get("chunk_index"),
                "token_est": p.get("token_est"),
                "wikilinks": p.get("wikilinks"),
                "ext_links": [e.get("href") for e in p.get("external_links", [])],
                "prev": (p.get("neighbors") or {}).get("prev"),
                "next": (p.get("neighbors") or {}).get("next"),
            }
            for p in pls
        ]
        print(json.dumps(out, ensure_ascii=False, indent=2))
    else:
        print(json.dumps(pls, ensure_ascii=False, indent=2))


if __name__ == "__main__":  # pragma: no cover
    _cli()