From 4872374a6e7de65531bf0b7e294f28f3523df638 Mon Sep 17 00:00:00 2001 From: Lars Date: Tue, 9 Sep 2025 19:42:09 +0200 Subject: [PATCH] app/core/note_payload.py aktualisiert --- app/core/note_payload.py | 222 +++++++++++++-------------------------- 1 file changed, 71 insertions(+), 151 deletions(-) diff --git a/app/core/note_payload.py b/app/core/note_payload.py index a66ddea..f60db87 100644 --- a/app/core/note_payload.py +++ b/app/core/note_payload.py @@ -1,36 +1,8 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- # Modul: app/core/note_payload.py -# Version: 1.6.3 +# Version: 1.7.0 # Datum: 2025-09-09 -# -# Zweck -# ----- -# Erzeugt den Qdrant-Payload für Notes inkl. robuster Hash-Bildung zur -# Änderungserkennung. Der vollständige (parsed) Body wird unter "fulltext" -# persistiert; der Pfad ist relativ (für verlustfreien Export). -# -# Steuerung (CLI/ENV, vom Importer durchgereicht) -# ----------------------------------------------- -# - Vergleichsmodus: -# --hash-mode body|frontmatter|full -# ENV: MINDNET_HASH_MODE oder MINDNET_HASH_COMPARE (Body|Frontmatter|Full) -# "full" ist Alias für "body+frontmatter". -# - Hash-Quelle: -# --hash-source parsed|raw (ENV: MINDNET_HASH_SOURCE) -# - Normalisierung: -# --hash-normalize canonical|none (ENV: MINDNET_HASH_NORMALIZE) -# -# Payload-Felder (Auszug) -# ----------------------- -# note_id, title, type, status, created, updated, path, tags, -# fulltext, references (Note-Level-Wikilinks), -# hash_fulltext (Primärhash), hash_signature (z. B. "body:raw:none:") -# -# Hinweise -# -------- -# - Keine Abhängigkeit von FS-Zeitstempeln; nur Inhalte fließen in den Hash ein. -# - Abwärtskompatibel: Feldernamen bleiben stabil; zusätzliche Felder stören nicht. from __future__ import annotations @@ -45,17 +17,14 @@ try: except Exception: # pragma: no cover from .parser import read_markdown, extract_wikilinks, FRONTMATTER_RE # type: ignore - # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _canon_frontmatter(fm: Dict[str, Any]) -> str: - """Kanonische JSON-Serialisierung der Frontmatter für Hashbildung.""" return json.dumps(fm or {}, ensure_ascii=False, separators=(",", ":"), sort_keys=True) def _normalize_body(body: str, mode: str) -> str: - """Normalisiert den Body für reproduzierbare Hashes (oder nicht).""" if mode == "none": return body if body is not None else "" text = (body or "").replace("\r\n", "\n").replace("\r", "\n") @@ -63,12 +32,6 @@ def _normalize_body(body: str, mode: str) -> str: return text def _resolve_hash_mode(explicit: Optional[str]) -> str: - """ - Normalisiert den Hash-Modus auf: - 'body' | 'frontmatter' | 'body+frontmatter' - Akzeptiert 'full' als Alias. - Berücksichtigt ENV: MINDNET_HASH_MODE oder MINDNET_HASH_COMPARE. - """ if explicit: val = explicit.strip().lower() else: @@ -76,17 +39,12 @@ def _resolve_hash_mode(explicit: Optional[str]) -> str: or os.environ.get("MINDNET_HASH_COMPARE") or "body").strip().lower() if val in ("full", "fulltext", "body+frontmatter", "bodyplusfrontmatter"): - return "body+frontmatter" + return "full" if val in ("frontmatter", "fm"): return "frontmatter" return "body" def _read_raw_body_from_file(file_path: Optional[str]) -> Tuple[str, Dict[str, Any]]: - """Liest die Rohdatei und extrahiert Body & Frontmatter ohne Parser-Logik. - - Rückgabe: - (body_text, frontmatter_dict) - """ if not file_path or not os.path.exists(file_path): return "", {} try: @@ -94,7 +52,6 @@ def _read_raw_body_from_file(file_path: Optional[str]) -> Tuple[str, Dict[str, A raw = f.read() except Exception: return "", {} - # Frontmatter per Regex entfernen m = FRONTMATTER_RE.match(raw) fm = {} if m: @@ -109,55 +66,21 @@ def _read_raw_body_from_file(file_path: Optional[str]) -> Tuple[str, Dict[str, A body = raw return body, fm - -# --------------------------------------------------------------------------- -# Hashing -# --------------------------------------------------------------------------- - def _sha256(s: str) -> str: h = hashlib.sha256() h.update(s.encode("utf-8")) return h.hexdigest() -def compute_hash( - *, - body: Optional[str], - frontmatter: Optional[Dict[str, Any]], - mode: Optional[str] = None, - normalize: Optional[str] = None, -) -> str: - """ - Berechnet einen Hex-Hash gemäß 'mode' und 'normalize'. - - mode: "body" | "frontmatter" | "body+frontmatter" - normalize: "canonical" | "none" - """ - mode = _resolve_hash_mode(mode) - normalize = (normalize or os.environ.get("MINDNET_HASH_NORMALIZE", "canonical")).strip().lower() +def _hash_for(mode: str, *, body: str, fm: Dict[str, Any], normalize: str) -> str: body_n = _normalize_body(body or "", normalize) - fm_s = _canon_frontmatter(frontmatter or {}) - + fm_s = _canon_frontmatter(fm or {}) if mode == "frontmatter": return _sha256(fm_s) - if mode == "body+frontmatter": + if mode == "full": return _sha256(body_n + "\n--FM--\n" + fm_s) # default: body return _sha256(body_n) -def compute_hash_set(*, body_parsed: str, body_raw: str, fm: Dict[str, Any], normalize: str) -> Dict[str, str]: - """Berechnet ein Set an Hashes für Monitoring/Debug.""" - fm_s = _canon_frontmatter(fm or {}) - bp = _normalize_body(body_parsed or "", normalize) - br = _normalize_body(body_raw or "", normalize) - return { - "frontmatter": _sha256(fm_s), - "body_parsed": _sha256(bp), - "body_raw": _sha256(br), - "full_parsed": _sha256(bp + "\n--FM--\n" + fm_s), - "full_raw": _sha256(br + "\n--FM--\n" + fm_s), - } - - # --------------------------------------------------------------------------- # Kernfunktion # --------------------------------------------------------------------------- @@ -172,27 +95,13 @@ def make_note_payload( file_path: Optional[str] = None, ) -> Dict[str, Any]: """ - Erzeugt den Payload für eine geparste Note. - - Parameters - ---------- - parsed : Any - Objekt mit Attributen/Keys 'frontmatter', 'body', 'path'. - vault_root : Optional[str] - Vault-Wurzel (für Pfad-Relativierung). - hash_mode : Optional[str] - "body" | "frontmatter" | "body+frontmatter" | "full" (Alias; überschreibt ENV). - hash_normalize : Optional[str] - "canonical" | "none" (überschreibt ENV). - hash_source : Optional[str] - "parsed" (Default) oder "raw". Wenn "raw", wird der Body aus der Rohdatei gelesen. - file_path : Optional[str] - Pfad zur Markdown-Datei, erforderlich für 'hash_source=raw'. - - Returns - ------- - Dict[str, Any] - Qdrant-Payload für die Notes-Collection. + Liefert den Note-Payload inkl. Mehrfach-Hashes. + - Es werden IMMER die drei Hashes für (body|frontmatter|full) unter + 'parsed:canonical' erzeugt (Schlüssel: z. B. 'body:parsed:canonical'). + - Zusätzlich werden – falls die aktuelle Konfig (source/normalize) davon + abweicht – die drei Hashes unter den entsprechenden Schlüsseln erzeugt, + z. B. 'frontmatter:raw:none'. + - 'hash_fulltext' und 'hash_signature' repräsentieren den *aktuellen* Modus. """ # dict oder Objekt akzeptieren if isinstance(parsed, dict): @@ -204,38 +113,62 @@ def make_note_payload( body_parsed = getattr(parsed, "body", "") or "" path = getattr(parsed, "path", "") or "" - # Hash-Quelle bestimmen - src = (hash_source or os.environ.get("MINDNET_HASH_SOURCE", "parsed")).strip().lower() + # Zielpfad relativieren + rel_path = path + try: + if vault_root: + rel = os.path.relpath(path, vault_root) + rel = rel.replace("\\", "/").lstrip("/") + rel_path = rel + except Exception: + pass + + # Konfiguration auflösen + mode_resolved = _resolve_hash_mode(hash_mode) # body|frontmatter|full + src = (hash_source or os.environ.get("MINDNET_HASH_SOURCE", "parsed")).strip().lower() # parsed|raw + norm = (hash_normalize or os.environ.get("MINDNET_HASH_NORMALIZE", "canonical")).strip().lower() # canonical|none + + # Body-Quelle laden raw_body, raw_fm = ("", {}) if src == "raw": raw_body, raw_fm = _read_raw_body_from_file(file_path or path) - # Roh-FM ergänzen (nicht überschreiben) if isinstance(raw_fm, dict) and raw_fm: merged_fm = dict(fm) for k, v in raw_fm.items(): merged_fm.setdefault(k, v) fm = merged_fm + body_for_hash = raw_body + else: + body_for_hash = body_parsed - normalize = (hash_normalize or os.environ.get("MINDNET_HASH_NORMALIZE", "canonical")).strip().lower() - mode_resolved = _resolve_hash_mode(hash_mode) + # --- 1) Standard-Tripel (parsed:canonical) immer erzeugen --- + std_src = "parsed" + std_norm = "canonical" + std_hashes: Dict[str, str] = {} + for m in ("body", "frontmatter", "full"): + std_hashes[f"{m}:{std_src}:{std_norm}"] = _hash_for( + m, body=body_parsed, fm=fm, normalize=std_norm + ) - # Hash gemäß Modus/Quelle bilden - body_for_hash = raw_body if src == "raw" else body_parsed - primary_hash = compute_hash(body=body_for_hash, frontmatter=fm, mode=mode_resolved, normalize=normalize) - hash_signature = f"{'full' if mode_resolved=='body+frontmatter' else mode_resolved}:{src}:{normalize}:{primary_hash}" + # Convenience-Felder (für Tools) + hash_body = std_hashes["body:parsed:canonical"] + hash_frontmatter = std_hashes["frontmatter:parsed:canonical"] + hash_full = std_hashes["full:parsed:canonical"] - # Pfad relativieren - rel_path = path - try: - if vault_root: - rel = os.path.relpath(path, vault_root) - rel = rel.replace("\\", "/").lstrip("/") # normalisieren - rel_path = rel - except Exception: - pass + # --- 2) Hashes für die *aktuelle* Konfiguration (falls abweichend) --- + cur_hashes: Dict[str, str] = {} + if not (src == std_src and norm == std_norm): + for m in ("body", "frontmatter", "full"): + cur_hashes[f"{m}:{src}:{norm}"] = _hash_for( + m, body=body_for_hash, fm=fm, normalize=norm + ) - # Note-Level-Wikilinks (Fallback, wenn Chunks nicht geliefert werden) - note_level_refs = list(dict.fromkeys(extract_wikilinks(body_parsed))) if body_parsed else [] + # --- 3) Aktueller Modus für Backwards-Compat Felder --- + current_hash = _hash_for(mode_resolved, body=body_for_hash, fm=fm, normalize=norm) + hash_signature = f"{mode_resolved}:{src}:{norm}:{current_hash}" + + # Wikilinks (Note-Ebene) + refs = list(dict.fromkeys(extract_wikilinks(body_parsed))) if body_parsed else [] payload: Dict[str, Any] = { "note_id": fm.get("id") or fm.get("note_id"), @@ -246,47 +179,36 @@ def make_note_payload( "updated": fm.get("updated"), "path": rel_path or fm.get("path"), "tags": fm.get("tags"), - # Primärer Hash + Signatur (für Vergleich) - "hash_fulltext": primary_hash, - "hash_signature": hash_signature, - # Volltext persistieren (verlustfreie Rekonstruktion) – parsed Body + # Volltext für verlustfreien Export "fulltext": body_parsed, - # Fallback-Refs auf Note-Ebene - "references": note_level_refs, + # Backwards-Compat: + "hash_fulltext": current_hash, + "hash_signature": hash_signature, + # Option C: Mehrfach-Hashes + "hashes": {**std_hashes, **cur_hashes}, + "hash_body": hash_body, + "hash_frontmatter": hash_frontmatter, + "hash_full": hash_full, + # Fallback-Refs + "references": refs, } for k in ("area", "project", "source", "lang", "slug", "aliases"): if k in fm: payload[k] = fm[k] - # Optional: gesamtes Hash-Set persistieren (Debug/Monitoring) - if os.environ.get("MINDNET_HASH_RECORD_ALL", "false").strip().lower() == "true": - payload["hashes"] = compute_hash_set( - body_parsed=body_parsed, body_raw=raw_body, fm=fm, normalize=normalize - ) - - # Optional: Roh-Body-Hash separat (historische Kompatibilität) - if os.environ.get("MINDNET_HASH_STORE_RAW", "false").strip().lower() == "true" and src == "raw": - try: - payload["hash_raw_body"] = compute_hash( - body=raw_body, frontmatter=fm, mode="body", normalize="none" - ) - except Exception: - pass - return payload - # --------------------------------------------------------------------------- # CLI – Sichtprüfung # --------------------------------------------------------------------------- def _cli() -> None: ap = argparse.ArgumentParser(description="Note-Payload aus Markdown erzeugen und anzeigen") - ap.add_argument("--from-file", dest="src", required=True, help="Pfad zur Markdown-Datei") - ap.add_argument("--vault-root", dest="vault_root", default=None, help="Vault-Wurzel zur Pfad-Relativierung") - ap.add_argument("--print", dest="do_print", action="store_true", help="Payload auf stdout ausgeben") - ap.add_argument("--hash-mode", choices=["body", "frontmatter", "body+frontmatter", "full"], default=None) + ap.add_argument("--from-file", dest="src", required=True) + ap.add_argument("--vault-root", dest="vault_root", default=None) + ap.add_argument("--print", dest="do_print", action="store_true") + ap.add_argument("--hash-mode", choices=["body", "frontmatter", "full"], default=None) ap.add_argument("--hash-normalize", choices=["canonical", "none"], default=None) ap.add_argument("--hash-source", choices=["parsed", "raw"], default=None) args = ap.parse_args() @@ -300,10 +222,8 @@ def _cli() -> None: hash_source=args.hash_source, file_path=args.src, ) - if args.do_print: print(json.dumps(payload, ensure_ascii=False, indent=2)) - if __name__ == "__main__": # pragma: no cover _cli()