#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Module: app/core/note_payload.py
Version: 1.6.0
Date: 2025-09-09

Summary
-------
Builds the Qdrant payload for **notes**, including a robust content hash for
change detection. The complete parsed body is persisted under ``fulltext`` and
the file path is stored relative to the vault root, which allows a lossless
reconstruction on export (``fulltext`` first, otherwise the chunks).

Important
---------
- **Only content** goes into the hash (no file-system timestamps).
- Comparison modes:
    * body        -> body only
    * frontmatter -> frontmatter only
    * full        -> body + frontmatter
  Selected via CLI ``--hash-mode`` or ENV ``MINDNET_HASH_MODE`` /
  ``MINDNET_HASH_COMPARE``.
- Hash source:
    * parsed (default) -> uses the body delivered by the parser
    * raw              -> reads the raw file and strips the frontmatter by regex
  Selected via CLI ``--hash-source`` or ENV ``MINDNET_HASH_SOURCE``.
- Normalisation:
    * canonical (default) -> CRLF/CR become LF, trailing whitespace per line
                             is removed
    * none                -> no normalisation (detects every small change)
  Selected via CLI ``--hash-normalize`` or ENV ``MINDNET_HASH_NORMALIZE``.

New in v1.6.0
-------------
- ``hash_signature`` in the payload, formatted as
  ``<mode>:<source>:<normalize>:<hash>`` (e.g. it starts with "body:raw:none:").
- Optional "hash set" when ENV ``MINDNET_HASH_RECORD_ALL=true``:
    payload["hashes"] = {
        "body_parsed": "...", "body_raw": "...",
        "frontmatter": "...",
        "full_parsed": "...", "full_raw": "..."
    }

CLI (visual check)
------------------
    python3 -m app.core.note_payload --from-file ./vault/demo.md --vault-root ./vault --print --hash-mode full --hash-source raw --hash-normalize none
"""
from __future__ import annotations

import argparse
import hashlib
import json
import os
from typing import Any, Dict, Optional, Tuple

try:
    from app.core.parser import read_markdown, extract_wikilinks, FRONTMATTER_RE
except Exception:  # pragma: no cover
    from .parser import read_markdown, extract_wikilinks, FRONTMATTER_RE  # type: ignore


# Separator placed between body and serialised frontmatter in "full" hashes.
# It is part of the hashed bytes: changing it invalidates every stored hash.
_FM_SEPARATOR = "\n--FM--\n"


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _json_fallback(value: Any) -> str:
    """Return a deterministic string for values ``json`` cannot serialise.

    YAML loaders turn unquoted dates/timestamps into ``date``/``datetime``
    objects; those (and anything else exposing ``isoformat``) are rendered in
    ISO form, every other object via ``str``.
    """
    iso = getattr(value, "isoformat", None)
    if callable(iso):
        return iso()
    return str(value)


def _canon_frontmatter(fm: Dict[str, Any]) -> str:
    """Serialise the frontmatter to canonical, stable JSON for hashing.

    Keys are sorted and separators are compact so that equal mappings always
    produce the same text. Values that are not JSON-native (e.g. dates) are
    converted by ``_json_fallback`` instead of raising ``TypeError``; the
    output for already-serialisable input is unaffected.
    """
    return json.dumps(
        fm or {},
        ensure_ascii=False,
        separators=(",", ":"),
        sort_keys=True,
        default=_json_fallback,
    )


def _normalize_body(body: str, mode: str) -> str:
    """Normalise the body for reproducible hashes.

    ``mode == "none"`` returns the body untouched (``None`` becomes ``""``).
    Any other value is treated as "canonical": CRLF and lone CR become LF and
    trailing whitespace is stripped from every line.
    """
    if mode == "none":
        return body if body is not None else ""
    text = (body or "").replace("\r\n", "\n").replace("\r", "\n")
    return "\n".join(line.rstrip() for line in text.split("\n"))


def _resolve_hash_mode(explicit: Optional[str]) -> str:
    """Normalise the hash mode to 'body', 'frontmatter' or 'body+frontmatter'.

    An explicit value wins; otherwise ENV ``MINDNET_HASH_MODE`` and then
    ``MINDNET_HASH_COMPARE`` are consulted, defaulting to "body". The alias
    "full" (and a few spellings of it) maps to 'body+frontmatter'.
    Unrecognised values fall back to 'body'.
    """
    if explicit:
        val = explicit.strip().lower()
    else:
        val = (
            os.environ.get("MINDNET_HASH_MODE")
            or os.environ.get("MINDNET_HASH_COMPARE")
            or "body"
        ).strip().lower()
    if val in ("full", "fulltext", "body+frontmatter", "bodyplusfrontmatter"):
        return "body+frontmatter"
    if val in ("frontmatter", "fm"):
        return "frontmatter"
    return "body"


def _env_flag(name: str) -> bool:
    """Return True when the environment variable ``name`` is set to "true"."""
    return os.environ.get(name, "false").strip().lower() == "true"


def _read_raw_body_from_file(file_path: Optional[str]) -> Tuple[str, Dict[str, Any]]:
    """Read the raw file and split it into body and frontmatter.

    This bypasses the parser: the frontmatter block is located with
    ``FRONTMATTER_RE`` and everything after the match is the body.

    Returns:
        ``(body_text, frontmatter_dict)``. ``("", {})`` when no path is given
        or the file cannot be read/decoded. The frontmatter is ``{}`` when it
        is missing, cannot be parsed, or is not a mapping.
    """
    if not file_path:
        return "", {}
    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            raw = handle.read()
    except (OSError, ValueError):
        # OSError: missing/unreadable file or a directory.
        # ValueError: undecodable bytes (UnicodeDecodeError) or an invalid path.
        return "", {}

    match = FRONTMATTER_RE.match(raw)
    if not match:
        return raw, {}

    fm: Dict[str, Any] = {}
    try:
        import yaml  # lazy: only needed for raw-source hashing

        loaded = yaml.safe_load(match.group(1))
    except Exception:
        # Deliberately best-effort: a missing PyYAML or malformed frontmatter
        # must not prevent hashing the body.
        loaded = None
    if isinstance(loaded, dict):
        # Frontmatter that parses to a list/scalar is not usable as metadata.
        fm = loaded
    return raw[match.end():], fm


# ---------------------------------------------------------------------------
# Hashing
# ---------------------------------------------------------------------------

def _sha256(s: str) -> str:
    """Return the hex SHA-256 digest of the UTF-8 encoding of ``s``."""
    return hashlib.sha256(s.encode("utf-8")).hexdigest()


def compute_hash(*, body: Optional[str], frontmatter: Optional[Dict[str, Any]],
                 mode: Optional[str] = None, normalize: Optional[str] = None) -> str:
    """Compute a hex hash according to ``mode`` and ``normalize``.

    Args:
        body: Note body; ``None`` is treated as empty.
        frontmatter: Frontmatter mapping; ``None`` is treated as empty.
        mode: "body" | "frontmatter" | "body+frontmatter" (alias "full").
            Falls back to ENV, then "body" (see ``_resolve_hash_mode``).
        normalize: "canonical" | "none". Falls back to ENV
            ``MINDNET_HASH_NORMALIZE``, then "canonical".

    Returns:
        SHA-256 hex digest of the selected content.
    """
    mode = _resolve_hash_mode(mode)
    normalize = (normalize or os.environ.get("MINDNET_HASH_NORMALIZE", "canonical")).strip().lower()

    # The frontmatter is only serialised when the mode actually hashes it.
    if mode == "frontmatter":
        return _sha256(_canon_frontmatter(frontmatter or {}))

    body_n = _normalize_body(body or "", normalize)
    if mode == "body+frontmatter":
        return _sha256(body_n + _FM_SEPARATOR + _canon_frontmatter(frontmatter or {}))
    return _sha256(body_n)


def compute_hash_set(*, body_parsed: str, body_raw: str, fm: Dict[str, Any], normalize: str) -> Dict[str, str]:
    """Compute the full set of hashes for monitoring/debugging.

    Returns a dict with the keys ``frontmatter``, ``body_parsed``,
    ``body_raw``, ``full_parsed`` and ``full_raw``; bodies are normalised with
    ``normalize`` and "full" variants use the same layout as ``compute_hash``.
    """
    fm_s = _canon_frontmatter(fm or {})
    bp = _normalize_body(body_parsed or "", normalize)
    br = _normalize_body(body_raw or "", normalize)
    return {
        "frontmatter": _sha256(fm_s),
        "body_parsed": _sha256(bp),
        "body_raw": _sha256(br),
        "full_parsed": _sha256(bp + _FM_SEPARATOR + fm_s),
        "full_raw": _sha256(br + _FM_SEPARATOR + fm_s),
    }


# ---------------------------------------------------------------------------
# Core function
# ---------------------------------------------------------------------------

def _relative_path(path: Any, vault_root: Optional[str]) -> Any:
    """Return ``path`` relative to ``vault_root`` with forward slashes.

    Without a vault root, or when the path cannot be relativised (empty path,
    different drives, unsupported type), ``path`` is returned unchanged.
    """
    if not vault_root:
        return path
    try:
        rel = os.path.relpath(path, vault_root)
        # Normalise Windows separators and drop any leading slash.
        return rel.replace("\\", "/").lstrip("/")
    except (ValueError, TypeError):
        return path


def make_note_payload(
    parsed: Any,
    vault_root: Optional[str] = None,
    *,
    hash_mode: Optional[str] = None,
    hash_normalize: Optional[str] = None,
    hash_source: Optional[str] = None,
    file_path: Optional[str] = None,
) -> Dict[str, Any]:
    """Build the Qdrant payload for a parsed note.

    Parameters
    ----------
    parsed : Any
        Dict or object providing ``frontmatter``, ``body`` and ``path``.
    vault_root : Optional[str]
        Vault root used to make the stored path relative.
    hash_mode : Optional[str]
        "body" | "frontmatter" | "body+frontmatter" | "full" (alias);
        overrides ENV.
    hash_normalize : Optional[str]
        "canonical" | "none"; overrides ENV.
    hash_source : Optional[str]
        "parsed" (default) or "raw". With "raw" the body is read from the raw
        file instead of the parser output.
    file_path : Optional[str]
        Path of the markdown file, used for raw reads; falls back to the
        ``path`` of ``parsed``.

    Returns
    -------
    Dict[str, Any]
        Payload for the notes collection.
    """
    # Duck typing: accept a dict as well as an attribute-style object.
    if isinstance(parsed, dict):
        fm = parsed.get("frontmatter") or {}
        body_parsed = parsed.get("body") or ""
        path = parsed.get("path") or ""
    else:
        fm = getattr(parsed, "frontmatter", {}) or {}
        body_parsed = getattr(parsed, "body", "") or ""
        path = getattr(parsed, "path", "") or ""

    src = (hash_source or os.environ.get("MINDNET_HASH_SOURCE", "parsed")).strip().lower()
    record_all = _env_flag("MINDNET_HASH_RECORD_ALL")

    raw_body = ""
    if src == "raw":
        raw_body, raw_fm = _read_raw_body_from_file(file_path or path)
        # Raw frontmatter only fills gaps; parser-provided values win.
        if isinstance(raw_fm, dict) and raw_fm:
            merged_fm = dict(fm)
            for key, value in raw_fm.items():
                merged_fm.setdefault(key, value)
            fm = merged_fm
    elif record_all:
        # The hash set reports raw-body hashes as well, so the raw body is
        # needed even though the primary hash uses the parsed body. The
        # frontmatter is intentionally left untouched in this case.
        raw_body, _ = _read_raw_body_from_file(file_path or path)

    normalize = (hash_normalize or os.environ.get("MINDNET_HASH_NORMALIZE", "canonical")).strip().lower()
    mode_resolved = _resolve_hash_mode(hash_mode)

    # Primary hash according to mode/source, plus a signature describing how
    # it was produced so that hashes from different settings are not compared.
    body_for_hash = raw_body if src == "raw" else body_parsed
    primary_hash = compute_hash(body=body_for_hash, frontmatter=fm, mode=mode_resolved, normalize=normalize)
    hash_signature = f"{'full' if mode_resolved=='body+frontmatter' else mode_resolved}:{src}:{normalize}:{primary_hash}"

    rel_path = _relative_path(path, vault_root)

    # Note-level wikilinks (fallback when no chunks are delivered);
    # dict.fromkeys de-duplicates while keeping first-seen order.
    note_level_refs = list(dict.fromkeys(extract_wikilinks(body_parsed))) if body_parsed else []

    payload: Dict[str, Any] = {
        "note_id": fm.get("id") or fm.get("note_id"),
        "title": fm.get("title"),
        "type": fm.get("type"),
        "status": fm.get("status"),
        "created": fm.get("created"),
        "updated": fm.get("updated"),
        "path": rel_path or fm.get("path"),
        "tags": fm.get("tags"),
        # Primary hash + signature (for comparison)
        "hash_fulltext": primary_hash,
        "hash_signature": hash_signature,
        # Persist the parsed body for lossless reconstruction
        "fulltext": body_parsed,
        # Fallback references at note level
        "references": note_level_refs,
    }

    # Optional frontmatter fields are copied only when present.
    for key in ("area", "project", "source", "lang", "slug", "aliases"):
        if key in fm:
            payload[key] = fm[key]

    # Optional: persist the whole hash set (debug/monitoring).
    if record_all:
        payload["hashes"] = compute_hash_set(body_parsed=body_parsed, body_raw=raw_body, fm=fm, normalize=normalize)

    # Optional: un-normalised raw-body hash (historical compatibility).
    if _env_flag("MINDNET_HASH_STORE_RAW") and src == "raw":
        payload["hash_raw_body"] = compute_hash(body=raw_body, frontmatter=fm, mode="body", normalize="none")

    return payload


# ---------------------------------------------------------------------------
# CLI - visual check
# ---------------------------------------------------------------------------

def _cli() -> None:
    """Parse a markdown file and optionally print its payload as JSON."""
    ap = argparse.ArgumentParser(description="Note-Payload aus Markdown erzeugen und anzeigen")
    ap.add_argument("--from-file", dest="src", required=True, help="Pfad zur Markdown-Datei")
    ap.add_argument("--vault-root", dest="vault_root", default=None, help="Vault-Wurzel zur Pfad-Relativierung")
    ap.add_argument("--print", dest="do_print", action="store_true", help="Payload auf stdout ausgeben")
    ap.add_argument("--hash-mode", choices=["body", "frontmatter", "body+frontmatter", "full"], default=None)
    ap.add_argument("--hash-normalize", choices=["canonical", "none"], default=None)
    ap.add_argument("--hash-source", choices=["parsed", "raw"], default=None)
    args = ap.parse_args()

    parsed = read_markdown(args.src)
    payload = make_note_payload(
        parsed,
        vault_root=args.vault_root,
        hash_mode=args.hash_mode,
        hash_normalize=args.hash_normalize,
        hash_source=args.hash_source,
        file_path=args.src,
    )

    if args.do_print:
        # Frontmatter values such as dates are passed through into the
        # payload; render them instead of failing on serialisation.
        print(json.dumps(payload, ensure_ascii=False, indent=2, default=_json_fallback))


if __name__ == "__main__":  # pragma: no cover
    _cli()