#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Modul: app/core/note_payload.py
# Version: 1.6.1
# Datum: 2025-09-09
#
# Kurzbeschreibung
# ----------------
# Erzeugt den Qdrant-Payload für Notes inkl. robuster Hash-Bildung zur
# Änderungserkennung. Der vollständige Body wird unter "fulltext" persistiert;
# der Pfad ist relativ (für verlustfreien Export).
#
# Wichtige Punkte
# ---------------
# - Nur Inhalte fließen in den Hash ein (keine FS-Zeitstempel).
# - Vergleichsarten:
#     Body        -> nur Body
#     Frontmatter -> nur Frontmatter
#     Full        -> Body + Frontmatter
#   Steuerbar per CLI/ENV:
#     --hash-mode body|frontmatter|full
#     MINDNET_HASH_MODE / MINDNET_HASH_COMPARE (Body|Frontmatter|Full)
# - Hash-Quelle:
#     parsed (Default) -> Parser-Body
#     raw              -> Rohdatei-Body (Frontmatter via Regex entfernt)
#   Steuerbar per:
#     --hash-source parsed|raw
#     MINDNET_HASH_SOURCE
# - Normalisierung:
#     canonical (Default) -> \r\n->\n, trailing spaces pro Zeile entfernt
#     none                -> keine Normalisierung (jede Kleinigkeit zählt)
#   Steuerbar per:
#     --hash-normalize canonical|none
#     MINDNET_HASH_NORMALIZE
#
# Neu in v1.6.x
# -------------
# - "hash_signature" im Payload, z. B. "body:raw:none:<hex>".
# - Optional (ENV MINDNET_HASH_RECORD_ALL=true): zusätzliches Hash-Set für Debug:
#     payload["hashes"] = {
#         "body_parsed": "...", "body_raw": "...",
#         "frontmatter": "...",
#         "full_parsed": "...", "full_raw": "..."
#     }
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import hashlib
|
|
import json
|
|
import os
|
|
from typing import Any, Dict, Optional, Tuple
|
|
|
|
try:
|
|
from app.core.parser import read_markdown, extract_wikilinks, FRONTMATTER_RE
|
|
except Exception: # pragma: no cover
|
|
from .parser import read_markdown, extract_wikilinks, FRONTMATTER_RE # type: ignore
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
|
|
|
|
def _canon_frontmatter(fm: Dict[str, Any]) -> str:
|
|
"""Kanonische JSON-Serialisierung der Frontmatter für Hashbildung."""
|
|
return json.dumps(fm or {}, ensure_ascii=False, separators=(",", ":"), sort_keys=True)
|
|
|
|
def _normalize_body(body: str, mode: str) -> str:
|
|
"""Normalisiert den Body für reproduzierbare Hashes (oder nicht)."""
|
|
if mode == "none":
|
|
return body if body is not None else ""
|
|
text = (body or "").replace("\r\n", "\n").replace("\r", "\n")
|
|
text = "\n".join(line.rstrip() for line in text.split("\n"))
|
|
return text
|
|
|
|
def _resolve_hash_mode(explicit: Optional[str]) -> str:
|
|
"""
|
|
Normalisiert den Hash-Modus auf:
|
|
'body' | 'frontmatter' | 'body+frontmatter'
|
|
Akzeptiert 'full' als Alias.
|
|
Berücksichtigt ENV: MINDNET_HASH_MODE oder MINDNET_HASH_COMPARE.
|
|
"""
|
|
if explicit:
|
|
val = explicit.strip().lower()
|
|
else:
|
|
val = (os.environ.get("MINDNET_HASH_MODE")
|
|
or os.environ.get("MINDNET_HASH_COMPARE")
|
|
or "body").strip().lower()
|
|
if val in ("full", "fulltext", "body+frontmatter", "bodyplusfrontmatter"):
|
|
return "body+frontmatter"
|
|
if val in ("frontmatter", "fm"):
|
|
return "frontmatter"
|
|
return "body"
|
|
|
|
def _read_raw_body_from_file(file_path: Optional[str]) -> Tuple[str, Dict[str, Any]]:
    """Read a raw markdown file and split it into body and frontmatter.

    No parser logic is involved: the frontmatter block is located via
    FRONTMATTER_RE and decoded with ``yaml.safe_load`` on a best-effort
    basis.  Any I/O or YAML error degrades gracefully to empty results.

    Returns
    -------
    Tuple[str, Dict[str, Any]]
        (body_text, frontmatter_dict)
    """
    if not file_path or not os.path.exists(file_path):
        return "", {}

    try:
        with open(file_path, "r", encoding="utf-8") as fh:
            raw_text = fh.read()
    except Exception:
        # Best effort: an unreadable file counts as "no content".
        return "", {}

    match = FRONTMATTER_RE.match(raw_text)
    if not match:
        return raw_text, {}

    frontmatter: Dict[str, Any] = {}
    try:
        import yaml  # lazy: only needed when a frontmatter block exists
        frontmatter = yaml.safe_load(match.group(1)) or {}
    except Exception:
        frontmatter = {}
    return raw_text[match.end():], frontmatter
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Hashing
# ---------------------------------------------------------------------------
|
|
|
|
def _sha256(s: str) -> str:
|
|
h = hashlib.sha256()
|
|
h.update(s.encode("utf-8"))
|
|
return h.hexdigest()
|
|
|
|
def compute_hash(
    *,
    body: Optional[str],
    frontmatter: Optional[Dict[str, Any]],
    mode: Optional[str] = None,
    normalize: Optional[str] = None,
) -> str:
    """Compute a hex hash over body and/or frontmatter.

    Parameters
    ----------
    body : Optional[str]
        Note body text; None counts as empty.
    frontmatter : Optional[Dict[str, Any]]
        Frontmatter mapping; None counts as empty.
    mode : Optional[str]
        "body" | "frontmatter" | "body+frontmatter" ("full" alias);
        falls back to MINDNET_HASH_MODE / MINDNET_HASH_COMPARE env vars.
    normalize : Optional[str]
        "canonical" | "none"; falls back to MINDNET_HASH_NORMALIZE.
    """
    resolved_mode = _resolve_hash_mode(mode)
    norm = (normalize or os.environ.get("MINDNET_HASH_NORMALIZE", "canonical")).strip().lower()

    normalized_body = _normalize_body(body or "", norm)
    canon_fm = _canon_frontmatter(frontmatter or {})

    if resolved_mode == "frontmatter":
        material = canon_fm
    elif resolved_mode == "body+frontmatter":
        material = normalized_body + "\n--FM--\n" + canon_fm
    else:
        # Default: hash only the body.
        material = normalized_body
    return _sha256(material)
|
|
|
|
def compute_hash_set(*, body_parsed: str, body_raw: str, fm: Dict[str, Any], normalize: str) -> Dict[str, str]:
    """Compute the full set of hashes used for monitoring/debugging.

    Returns a dict with the keys "frontmatter", "body_parsed",
    "body_raw", "full_parsed" and "full_raw".
    """
    canon_fm = _canon_frontmatter(fm or {})
    parsed = _normalize_body(body_parsed or "", normalize)
    raw = _normalize_body(body_raw or "", normalize)
    separator = "\n--FM--\n"

    hashes = {
        "frontmatter": _sha256(canon_fm),
        "body_parsed": _sha256(parsed),
        "body_raw": _sha256(raw),
        "full_parsed": _sha256(parsed + separator + canon_fm),
        "full_raw": _sha256(raw + separator + canon_fm),
    }
    return hashes
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Kernfunktion
# ---------------------------------------------------------------------------
|
|
|
|
def make_note_payload(
|
|
parsed: Any,
|
|
vault_root: Optional[str] = None,
|
|
*,
|
|
hash_mode: Optional[str] = None,
|
|
hash_normalize: Optional[str] = None,
|
|
hash_source: Optional[str] = None,
|
|
file_path: Optional[str] = None,
|
|
) -> Dict[str, Any]:
|
|
"""
|
|
Erzeugt den Payload für eine geparste Note.
|
|
|
|
Parameters
|
|
----------
|
|
parsed : Any
|
|
Objekt mit Attributen/Keys 'frontmatter', 'body', 'path'.
|
|
vault_root : Optional[str]
|
|
Vault-Wurzel (für Pfad-Relativierung).
|
|
hash_mode : Optional[str]
|
|
"body" | "frontmatter" | "body+frontmatter" | "full" (Alias; überschreibt ENV).
|
|
hash_normalize : Optional[str]
|
|
"canonical" | "none" (überschreibt ENV).
|
|
hash_source : Optional[str]
|
|
"parsed" (Default) oder "raw". Wenn "raw", wird der Body aus der Rohdatei gelesen.
|
|
file_path : Optional[str]
|
|
Pfad zur Markdown-Datei, erforderlich für 'hash_source=raw'.
|
|
|
|
Returns
|
|
-------
|
|
Dict[str, Any]
|
|
Qdrant-Payload für die Notes-Collection.
|
|
"""
|
|
# dict oder Objekt akzeptieren
|
|
if isinstance(parsed, dict):
|
|
fm = parsed.get("frontmatter") or {}
|
|
body_parsed = parsed.get("body") or ""
|
|
path = parsed.get("path") or ""
|
|
else:
|
|
fm = getattr(parsed, "frontmatter", {}) or {}
|
|
body_parsed = getattr(parsed, "body", "") or ""
|
|
path = getattr(parsed, "path", "") or ""
|
|
|
|
# Hash-Quelle bestimmen
|
|
src = (hash_source or os.environ.get("MINDNET_HASH_SOURCE", "parsed")).strip().lower()
|
|
raw_body, raw_fm = ("", {})
|
|
if src == "raw":
|
|
raw_body, raw_fm = _read_raw_body_from_file(file_path or path)
|
|
# Roh-FM ergänzen (nicht überschreiben)
|
|
if isinstance(raw_fm, dict) and raw_fm:
|
|
merged_fm = dict(fm)
|
|
for k, v in raw_fm.items():
|
|
merged_fm.setdefault(k, v)
|
|
fm = merged_fm
|
|
|
|
normalize = (hash_normalize or os.environ.get("MINDNET_HASH_NORMALIZE", "canonical")).strip().lower()
|
|
mode_resolved = _resolve_hash_mode(hash_mode)
|
|
|
|
# Hash gemäß Modus/Quelle bilden
|
|
body_for_hash = raw_body if src == "raw" else body_parsed
|
|
primary_hash = compute_hash(body=body_for_hash, frontmatter=fm, mode=mode_resolved, normalize=normalize)
|
|
hash_signature = f"{'full' if mode_resolved=='body+frontmatter' else mode_resolved}:{src}:{normalize}:{primary_hash}"
|
|
|
|
# Pfad relativieren
|
|
rel_path = path
|
|
try:
|
|
if vault_root:
|
|
rel = os.path.relpath(path, vault_root)
|
|
rel = rel.replace("\\", "/").lstrip("/") # normalisieren
|
|
rel_path = rel
|
|
except Exception:
|
|
pass
|
|
|
|
# Note-Level-Wikilinks
|