mindnet/scripts/debug_note_payload.py
Lars 65c40b287f
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 2s
scripts/debug_note_payload.py aktualisiert
2025-09-09 16:48:41 +02:00

310 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Modul: app/core/note_payload.py
# Version: 1.6.2
# Datum: 2025-09-09
#
# Zweck
# -----
# Erzeugt den Qdrant-Payload für Notes inkl. robuster Hash-Bildung zur
# Änderungserkennung. Der vollständige (parsed) Body wird unter "fulltext"
# persistiert; der Pfad ist relativ (für verlustfreien Export).
#
# Steuerung (CLI/ENV, vom Importer durchgereicht)
# -----------------------------------------------
# - Vergleichsmodus:
# --hash-mode body|frontmatter|full
# ENV: MINDNET_HASH_MODE oder MINDNET_HASH_COMPARE (Body|Frontmatter|Full)
# "full" ist Alias für "body+frontmatter".
# - Hash-Quelle:
# --hash-source parsed|raw (ENV: MINDNET_HASH_SOURCE)
# - Normalisierung:
# --hash-normalize canonical|none (ENV: MINDNET_HASH_NORMALIZE)
#
# Payload-Felder (Auszug)
# -----------------------
# note_id, title, type, status, created, updated, path, tags,
# fulltext, references (Note-Level-Wikilinks),
# hash_fulltext (Primärhash), hash_signature (z. B. "body:raw:none:<hex>")
#
# Hinweise
# --------
# - Keine Abhängigkeit von FS-Zeitstempeln; nur Inhalte fließen in den Hash ein.
# - Abwärtskompatibel: Feldernamen bleiben stabil; zusätzliche Felder stören nicht.
from __future__ import annotations
import argparse
import hashlib
import json
import os
from typing import Any, Dict, Optional, Tuple
try:
from app.core.parser import read_markdown, extract_wikilinks, FRONTMATTER_RE
except Exception: # pragma: no cover
from .parser import read_markdown, extract_wikilinks, FRONTMATTER_RE # type: ignore
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _canon_frontmatter(fm: Dict[str, Any]) -> str:
"""Kanonische JSON-Serialisierung der Frontmatter für Hashbildung."""
return json.dumps(fm or {}, ensure_ascii=False, separators=(",", ":"), sort_keys=True)
def _normalize_body(body: str, mode: str) -> str:
"""Normalisiert den Body für reproduzierbare Hashes (oder nicht)."""
if mode == "none":
return body if body is not None else ""
text = (body or "").replace("\r\n", "\n").replace("\r", "\n")
text = "\n".join(line.rstrip() for line in text.split("\n"))
return text
def _resolve_hash_mode(explicit: Optional[str]) -> str:
"""
Normalisiert den Hash-Modus auf:
'body' | 'frontmatter' | 'body+frontmatter'
Akzeptiert 'full' als Alias.
Berücksichtigt ENV: MINDNET_HASH_MODE oder MINDNET_HASH_COMPARE.
"""
if explicit:
val = explicit.strip().lower()
else:
val = (os.environ.get("MINDNET_HASH_MODE")
or os.environ.get("MINDNET_HASH_COMPARE")
or "body").strip().lower()
if val in ("full", "fulltext", "body+frontmatter", "bodyplusfrontmatter"):
return "body+frontmatter"
if val in ("frontmatter", "fm"):
return "frontmatter"
return "body"
def _read_raw_body_from_file(file_path: Optional[str]) -> Tuple[str, Dict[str, Any]]:
    """Read the raw file and split off body & frontmatter without parser logic.

    Returns
    -------
    (body_text, frontmatter_dict)
        Empty string/dict when the file is missing or unreadable, or when
        the frontmatter cannot be parsed as a YAML mapping.
    """
    if not file_path or not os.path.exists(file_path):
        return "", {}
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            raw = f.read()
    except (OSError, UnicodeDecodeError):
        # Unreadable file (permissions, deleted in a race, bad encoding) -> empty result.
        return "", {}
    # Strip the frontmatter block via the shared regex.
    m = FRONTMATTER_RE.match(raw)
    if not m:
        return raw, {}
    fm: Dict[str, Any] = {}
    try:
        import yaml  # lazy: only needed when a frontmatter block exists
        loaded = yaml.safe_load(m.group(1))
        # Fix: safe_load may yield a list/str/None for malformed frontmatter;
        # only accept a mapping so callers reliably get a dict.
        if isinstance(loaded, dict):
            fm = loaded
    except Exception:
        fm = {}
    return raw[m.end():], fm
# ---------------------------------------------------------------------------
# Hashing
# ---------------------------------------------------------------------------
def _sha256(s: str) -> str:
h = hashlib.sha256()
h.update(s.encode("utf-8"))
return h.hexdigest()
def compute_hash(
    *,
    body: Optional[str],
    frontmatter: Optional[Dict[str, Any]],
    mode: Optional[str] = None,
    normalize: Optional[str] = None,
) -> str:
    """
    Compute a hex hash according to *mode* and *normalize*.

    mode: "body" | "frontmatter" | "body+frontmatter" (aliases resolved
          via _resolve_hash_mode, with ENV fallback)
    normalize: "canonical" | "none" (ENV MINDNET_HASH_NORMALIZE as fallback)
    """
    resolved_mode = _resolve_hash_mode(mode)
    norm = (normalize or os.environ.get("MINDNET_HASH_NORMALIZE", "canonical")).strip().lower()
    fm_json = _canon_frontmatter(frontmatter or {})
    if resolved_mode == "frontmatter":
        return _sha256(fm_json)
    normalized_body = _normalize_body(body or "", norm)
    if resolved_mode == "body+frontmatter":
        return _sha256(normalized_body + "\n--FM--\n" + fm_json)
    # Default mode: body only.
    return _sha256(normalized_body)
def compute_hash_set(*, body_parsed: str, body_raw: str, fm: Dict[str, Any], normalize: str) -> Dict[str, str]:
    """Compute the full family of hashes (frontmatter / both bodies / combined) for monitoring & debug."""
    fm_json = _canon_frontmatter(fm or {})
    parsed_norm = _normalize_body(body_parsed or "", normalize)
    raw_norm = _normalize_body(body_raw or "", normalize)

    def _full(body_n: str) -> str:
        # Combined hash uses the same separator as compute_hash().
        return _sha256(body_n + "\n--FM--\n" + fm_json)

    return {
        "frontmatter": _sha256(fm_json),
        "body_parsed": _sha256(parsed_norm),
        "body_raw": _sha256(raw_norm),
        "full_parsed": _full(parsed_norm),
        "full_raw": _full(raw_norm),
    }
# ---------------------------------------------------------------------------
# Kernfunktion
# ---------------------------------------------------------------------------
def make_note_payload(
    parsed: Any,
    vault_root: Optional[str] = None,
    *,
    hash_mode: Optional[str] = None,
    hash_normalize: Optional[str] = None,
    hash_source: Optional[str] = None,
    file_path: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Build the Qdrant payload for a parsed note.

    Parameters
    ----------
    parsed : Any
        Dict or object exposing 'frontmatter', 'body' and 'path'.
    vault_root : Optional[str]
        Vault root used to relativize the note path.
    hash_mode : Optional[str]
        "body" | "frontmatter" | "body+frontmatter" | "full" (alias; overrides ENV).
    hash_normalize : Optional[str]
        "canonical" | "none" (overrides ENV).
    hash_source : Optional[str]
        "parsed" (default) or "raw". With "raw" the body is re-read from the raw file.
    file_path : Optional[str]
        Path to the Markdown file; required for 'hash_source=raw'.

    Returns
    -------
    Dict[str, Any]
        Payload for the notes collection.
    """
    # Accept either a plain dict or an attribute-style object.
    if isinstance(parsed, dict):
        fm = parsed.get("frontmatter") or {}
        body_parsed = parsed.get("body") or ""
        path = parsed.get("path") or ""
    else:
        fm = getattr(parsed, "frontmatter", {}) or {}
        body_parsed = getattr(parsed, "body", "") or ""
        path = getattr(parsed, "path", "") or ""
    # Determine the hash source (argument wins over ENV, default "parsed").
    src = (hash_source or os.environ.get("MINDNET_HASH_SOURCE", "parsed")).strip().lower()
    raw_body, raw_fm = ("", {})
    if src == "raw":
        raw_body, raw_fm = _read_raw_body_from_file(file_path or path)
        # Merge raw frontmatter in as a supplement only — parsed keys win.
        if isinstance(raw_fm, dict) and raw_fm:
            merged_fm = dict(fm)
            for k, v in raw_fm.items():
                merged_fm.setdefault(k, v)
            fm = merged_fm
    normalize = (hash_normalize or os.environ.get("MINDNET_HASH_NORMALIZE", "canonical")).strip().lower()
    mode_resolved = _resolve_hash_mode(hash_mode)
    # Compute the primary hash according to mode/source.
    body_for_hash = raw_body if src == "raw" else body_parsed
    primary_hash = compute_hash(body=body_for_hash, frontmatter=fm, mode=mode_resolved, normalize=normalize)
    # Signature encodes how the hash was produced, e.g. "body:raw:none:<hex>".
    hash_signature = f"{'full' if mode_resolved=='body+frontmatter' else mode_resolved}:{src}:{normalize}:{primary_hash}"
    # Relativize the path against the vault root (best effort; keep original on failure).
    rel_path = path
    try:
        if vault_root:
            rel = os.path.relpath(path, vault_root)
            rel = rel.replace("\\", "/").lstrip("/")  # normalize to forward slashes
            rel_path = rel
    except Exception:
        pass
    # Note-level wikilinks, deduplicated in order (fallback when chunks are not supplied).
    note_level_refs = list(dict.fromkeys(extract_wikilinks(body_parsed))) if body_parsed else []
    payload: Dict[str, Any] = {
        "note_id": fm.get("id") or fm.get("note_id"),
        "title": fm.get("title"),
        "type": fm.get("type"),
        "status": fm.get("status"),
        "created": fm.get("created"),
        "updated": fm.get("updated"),
        "path": rel_path or fm.get("path"),
        "tags": fm.get("tags"),
        # Primary hash + signature (used for change detection).
        "hash_fulltext": primary_hash,
        "hash_signature": hash_signature,
        # Persist the parsed body as full text (lossless reconstruction).
        "fulltext": body_parsed,
        # Fallback references at note level.
        "references": note_level_refs,
    }
    # Pass selected optional frontmatter fields straight through.
    for k in ("area", "project", "source", "lang", "slug", "aliases"):
        if k in fm:
            payload[k] = fm[k]
    # Optional: persist the whole hash set (debug/monitoring).
    if os.environ.get("MINDNET_HASH_RECORD_ALL", "false").strip().lower() == "true":
        payload["hashes"] = compute_hash_set(
            body_parsed=body_parsed, body_raw=raw_body, fm=fm, normalize=normalize
        )
    # Optional: separate raw-body hash (historical compatibility).
    if os.environ.get("MINDNET_HASH_STORE_RAW", "false").strip().lower() == "true" and src == "raw":
        try:
            payload["hash_raw_body"] = compute_hash(
                body=raw_body, frontmatter=fm, mode="body", normalize="none"
            )
        except Exception:
            pass
    return payload
# ---------------------------------------------------------------------------
# CLI Sichtprüfung
# ---------------------------------------------------------------------------
def _cli() -> None:
    """Command-line entry point: build the payload for one Markdown file and optionally print it."""
    parser = argparse.ArgumentParser(description="Note-Payload aus Markdown erzeugen und anzeigen")
    parser.add_argument("--from-file", dest="src", required=True, help="Pfad zur Markdown-Datei")
    parser.add_argument("--vault-root", dest="vault_root", default=None, help="Vault-Wurzel zur Pfad-Relativierung")
    parser.add_argument("--print", dest="do_print", action="store_true", help="Payload auf stdout ausgeben")
    parser.add_argument("--hash-mode", choices=["body", "frontmatter", "body+frontmatter", "full"], default=None)
    parser.add_argument("--hash-normalize", choices=["canonical", "none"], default=None)
    parser.add_argument("--hash-source", choices=["parsed", "raw"], default=None)
    opts = parser.parse_args()

    note = read_markdown(opts.src)
    result = make_note_payload(
        note,
        vault_root=opts.vault_root,
        hash_mode=opts.hash_mode,
        hash_normalize=opts.hash_normalize,
        hash_source=opts.hash_source,
        file_path=opts.src,
    )
    if opts.do_print:
        print(json.dumps(result, ensure_ascii=False, indent=2))


if __name__ == "__main__":  # pragma: no cover
    _cli()