#!/usr/bin/env python3 # -*- coding: utf-8 -*- # Modul: app/core/note_payload.py # Version: 1.6.1 # Datum: 2025-09-09 # # Kurzbeschreibung # ---------------- # Erzeugt den Qdrant-Payload für Notes inkl. robuster Hash-Bildung zur # Änderungserkennung. Der vollständige Body wird unter "fulltext" persistiert; # der Pfad ist relativ (für verlustfreien Export). # # Wichtige Punkte # --------------- # - Nur Inhalte fließen in den Hash ein (keine FS-Zeitstempel). # - Vergleichsarten: # Body -> nur Body # Frontmatter -> nur Frontmatter # Full -> Body + Frontmatter # Steuerbar per CLI/ENV: # --hash-mode body|frontmatter|full # MINDNET_HASH_MODE / MINDNET_HASH_COMPARE (Body|Frontmatter|Full) # - Hash-Quelle: # parsed (Default) -> Parser-Body # raw -> Rohdatei-Body (Frontmatter via Regex entfernt) # Steuerbar per: # --hash-source parsed|raw # MINDNET_HASH_SOURCE # - Normalisierung: # canonical (Default) -> \r\n->\n, trailing spaces pro Zeile entfernt # none -> keine Normalisierung (jede Kleinigkeit zählt) # Steuerbar per: # --hash-normalize canonical|none # MINDNET_HASH_NORMALIZE # # Neu in v1.6.x # ------------- # - "hash_signature" im Payload, z. B. "body:raw:none:". # - Optional (ENV MINDNET_HASH_RECORD_ALL=true): zusätzliches Hash-Set für Debug: # payload["hashes"] = { # "body_parsed": "...", "body_raw": "...", # "frontmatter": "...", # "full_parsed": "...", "full_raw": "..." 
# }
from __future__ import annotations

import argparse
import hashlib
import json
import os
from typing import Any, Dict, Optional, Tuple

try:
    from app.core.parser import read_markdown, extract_wikilinks, FRONTMATTER_RE
except Exception:  # pragma: no cover
    from .parser import read_markdown, extract_wikilinks, FRONTMATTER_RE  # type: ignore


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _canon_frontmatter(fm: Dict[str, Any]) -> str:
    """Canonical JSON serialization of the frontmatter for hashing.

    Sorted keys plus compact separators make the output independent of the
    original key order, so semantically equal frontmatter always produces
    the same string (and therefore the same hash).
    """
    return json.dumps(fm or {}, ensure_ascii=False, separators=(",", ":"), sort_keys=True)


def _normalize_body(body: str, mode: str) -> str:
    """Normalize *body* for reproducible hashing (or not).

    mode == "none"      -> return the body unchanged (None becomes "").
    mode == "canonical" -> convert CRLF/CR line endings to LF and strip
                           trailing whitespace from every line.
    """
    if mode == "none":
        return body if body is not None else ""
    text = (body or "").replace("\r\n", "\n").replace("\r", "\n")
    return "\n".join(line.rstrip() for line in text.split("\n"))


def _resolve_hash_mode(explicit: Optional[str]) -> str:
    """Resolve the hash mode to 'body' | 'frontmatter' | 'body+frontmatter'.

    Accepts 'full' (and a few spellings) as an alias for 'body+frontmatter'.
    Falls back to the environment variables MINDNET_HASH_MODE /
    MINDNET_HASH_COMPARE, then to the default 'body'.
    """
    if explicit:
        val = explicit.strip().lower()
    else:
        val = (os.environ.get("MINDNET_HASH_MODE")
               or os.environ.get("MINDNET_HASH_COMPARE")
               or "body").strip().lower()
    if val in ("full", "fulltext", "body+frontmatter", "bodyplusfrontmatter"):
        return "body+frontmatter"
    if val in ("frontmatter", "fm"):
        return "frontmatter"
    return "body"


def _read_raw_body_from_file(file_path: Optional[str]) -> Tuple[str, Dict[str, Any]]:
    """Read the raw file and split body & frontmatter without parser logic.

    Returns (body_text, frontmatter_dict).  Best-effort by design: any I/O
    or YAML error yields ("", {}) respectively an empty frontmatter instead
    of raising.
    """
    if not file_path or not os.path.exists(file_path):
        return "", {}
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            raw = f.read()
    except Exception:
        return "", {}
    # Strip frontmatter via regex.
    m = FRONTMATTER_RE.match(raw)
    fm: Dict[str, Any] = {}
    if m:
        fm_txt = m.group(1)
        try:
            import yaml  # lazy
            loaded = yaml.safe_load(fm_txt)
            # Bugfix: safe_load may return a scalar or list for degenerate
            # frontmatter; only accept an actual mapping to honor the
            # declared Dict return type (callers iterate fm.items()).
            fm = loaded if isinstance(loaded, dict) else {}
        except Exception:
            fm = {}
        body = raw[m.end():]
    else:
        body = raw
    return body, fm


# ---------------------------------------------------------------------------
# Hashing
# ---------------------------------------------------------------------------

def _sha256(s: str) -> str:
    """Return the hex SHA-256 digest of *s* (UTF-8 encoded)."""
    return hashlib.sha256(s.encode("utf-8")).hexdigest()


def compute_hash(
    *,
    body: Optional[str],
    frontmatter: Optional[Dict[str, Any]],
    mode: Optional[str] = None,
    normalize: Optional[str] = None,
) -> str:
    """Compute a hex hash according to *mode* and *normalize*.

    mode:      "body" | "frontmatter" | "body+frontmatter"
               ('full' is accepted as an alias; ENV fallback applies)
    normalize: "canonical" | "none"
               (default read from MINDNET_HASH_NORMALIZE, else "canonical")
    """
    mode = _resolve_hash_mode(mode)
    normalize = (normalize or os.environ.get("MINDNET_HASH_NORMALIZE", "canonical")).strip().lower()
    body_n = _normalize_body(body or "", normalize)
    fm_s = _canon_frontmatter(frontmatter or {})
    if mode == "frontmatter":
        return _sha256(fm_s)
    if mode == "body+frontmatter":
        # Fixed separator keeps (body, frontmatter) pairs unambiguous.
        return _sha256(body_n + "\n--FM--\n" + fm_s)
    # default: body
    return _sha256(body_n)


def compute_hash_set(*, body_parsed: str, body_raw: str, fm: Dict[str, Any], normalize: str) -> Dict[str, str]:
    """Compute a set of hashes for monitoring/debugging.

    Keys: "frontmatter", "body_parsed", "body_raw", "full_parsed",
    "full_raw" — the full_* variants use the same "--FM--" separator
    as compute_hash so they are comparable with the primary hash.
    """
    fm_s = _canon_frontmatter(fm or {})
    bp = _normalize_body(body_parsed or "", normalize)
    br = _normalize_body(body_raw or "", normalize)
    return {
        "frontmatter": _sha256(fm_s),
        "body_parsed": _sha256(bp),
        "body_raw": _sha256(br),
        "full_parsed": _sha256(bp + "\n--FM--\n" + fm_s),
        "full_raw": _sha256(br + "\n--FM--\n" + fm_s),
    }


# ---------------------------------------------------------------------------
# Kernfunktion
#
--------------------------------------------------------------------------- def make_note_payload( parsed: Any, vault_root: Optional[str] = None, *, hash_mode: Optional[str] = None, hash_normalize: Optional[str] = None, hash_source: Optional[str] = None, file_path: Optional[str] = None, ) -> Dict[str, Any]: """ Erzeugt den Payload für eine geparste Note. Parameters ---------- parsed : Any Objekt mit Attributen/Keys 'frontmatter', 'body', 'path'. vault_root : Optional[str] Vault-Wurzel (für Pfad-Relativierung). hash_mode : Optional[str] "body" | "frontmatter" | "body+frontmatter" | "full" (Alias; überschreibt ENV). hash_normalize : Optional[str] "canonical" | "none" (überschreibt ENV). hash_source : Optional[str] "parsed" (Default) oder "raw". Wenn "raw", wird der Body aus der Rohdatei gelesen. file_path : Optional[str] Pfad zur Markdown-Datei, erforderlich für 'hash_source=raw'. Returns ------- Dict[str, Any] Qdrant-Payload für die Notes-Collection. """ # dict oder Objekt akzeptieren if isinstance(parsed, dict): fm = parsed.get("frontmatter") or {} body_parsed = parsed.get("body") or "" path = parsed.get("path") or "" else: fm = getattr(parsed, "frontmatter", {}) or {} body_parsed = getattr(parsed, "body", "") or "" path = getattr(parsed, "path", "") or "" # Hash-Quelle bestimmen src = (hash_source or os.environ.get("MINDNET_HASH_SOURCE", "parsed")).strip().lower() raw_body, raw_fm = ("", {}) if src == "raw": raw_body, raw_fm = _read_raw_body_from_file(file_path or path) # Roh-FM ergänzen (nicht überschreiben) if isinstance(raw_fm, dict) and raw_fm: merged_fm = dict(fm) for k, v in raw_fm.items(): merged_fm.setdefault(k, v) fm = merged_fm normalize = (hash_normalize or os.environ.get("MINDNET_HASH_NORMALIZE", "canonical")).strip().lower() mode_resolved = _resolve_hash_mode(hash_mode) # Hash gemäß Modus/Quelle bilden body_for_hash = raw_body if src == "raw" else body_parsed primary_hash = compute_hash(body=body_for_hash, frontmatter=fm, mode=mode_resolved, 
normalize=normalize) hash_signature = f"{'full' if mode_resolved=='body+frontmatter' else mode_resolved}:{src}:{normalize}:{primary_hash}" # Pfad relativieren rel_path = path try: if vault_root: rel = os.path.relpath(path, vault_root) rel = rel.replace("\\", "/").lstrip("/") # normalisieren rel_path = rel except Exception: pass # Note-Level-Wikilinks