#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Modul: app/core/note_payload.py
Version: 1.6.0
Datum: 2025-09-09
Kurzbeschreibung
----------------
Erzeugt den Qdrant-Payload für **Notes** inkl. robuster Hash-Bildung
zur Änderungserkennung. Der vollständige Body wird unter ``fulltext`` persistiert;
der Pfad ist relativ (für verlustfreien Export).
Wichtig
-------
- **Nur Inhalte** fließen in den Hash ein (keine FS-Zeitstempel).
- Vergleichsarten:
* Body nur Body
* Frontmatter nur Frontmatter
* Full Body + Frontmatter
per CLI/ENV: ``--hash-mode`` oder ENV ``MINDNET_HASH_MODE``/``MINDNET_HASH_COMPARE``.
- Hash-Quelle:
* parsed (Default) nutzt den vom Parser gelieferten Body
* raw liest Rohdatei und entfernt Frontmatter via Regex
per CLI/ENV: ``--hash-source`` oder ENV ``MINDNET_HASH_SOURCE``.
- Normalisierung:
* canonical (Default) \r\n\n, Zeilenend-Whitespace entfernt
* none keine Normalisierung (erkennt jede Kleinigkeit)
per CLI/ENV: ``--hash-normalize`` oder ENV ``MINDNET_HASH_NORMALIZE``.
Neu in v1.6.0
-------------
- ``hash_signature`` im Payload, z. B. "body:raw:none:<hex>".
- Optionales Mitspeichern eines "Hash-Sets", wenn ENV ``MINDNET_HASH_RECORD_ALL=true``:
payload["hashes"] = {
"body_parsed": "...", "body_raw": "...",
"frontmatter": "...",
"full_parsed": "...", "full_raw": "..."
}
CLI (Sichtprüfung)
------------------
python3 -m app.core.note_payload --from-file ./vault/demo.md --vault-root ./vault --print --hash-mode full --hash-source raw --hash-normalize none
"""
from __future__ import annotations
import argparse
import hashlib
import json
import os
from typing import Any, Dict, Optional, Tuple
try:
    from app.core.parser import read_markdown, extract_wikilinks, FRONTMATTER_RE
except Exception:  # pragma: no cover
    from .parser import read_markdown, extract_wikilinks, FRONTMATTER_RE  # type: ignore
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _canon_frontmatter(fm: Dict[str, Any]) -> str:
    """Canonical, stable JSON serialization of the frontmatter for hashing."""
    return json.dumps(fm or {}, ensure_ascii=False, separators=(",", ":"), sort_keys=True)
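
# Example (sketch): sort_keys makes the serialization order-independent, so
# reordering frontmatter keys does not change the hash:
#   _canon_frontmatter({"b": 1, "a": 2})  ->  '{"a":2,"b":1}'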

def _normalize_body(body: str, mode: str) -> str:
    """Normalizes the body for reproducible hashes (or leaves it untouched)."""
    if mode == "none":
        return body if body is not None else ""
    # canonical: CRLF/CR -> LF, strip trailing whitespace per line
    text = (body or "").replace("\r\n", "\n").replace("\r", "\n")
    text = "\n".join(line.rstrip() for line in text.split("\n"))
    return text
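
# Example (sketch): canonical normalization makes CRLF vs. LF and trailing
# spaces hash-equal, while "none" preserves every byte:
#   _normalize_body("a \r\nb", "canonical")  ->  "a\nb"
#   _normalize_body("a \r\nb", "none")       ->  "a \r\nb"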

def _resolve_hash_mode(explicit: Optional[str]) -> str:
    """
    Normalizes the hash mode to one of:
        'body' | 'frontmatter' | 'body+frontmatter'
    Also accepts 'full' as an alias.
    Additionally honors ENV: MINDNET_HASH_MODE or MINDNET_HASH_COMPARE.
    """
    if explicit:
        val = explicit.strip().lower()
    else:
        val = (os.environ.get("MINDNET_HASH_MODE") or os.environ.get("MINDNET_HASH_COMPARE") or "body").strip().lower()
    if val in ("full", "fulltext", "body+frontmatter", "bodyplusfrontmatter"):
        return "body+frontmatter"
    if val in ("frontmatter", "fm"):
        return "frontmatter"
    return "body"

def _read_raw_body_from_file(file_path: Optional[str]) -> Tuple[str, Dict[str, Any]]:
    """Reads the raw file and extracts body & frontmatter without parser logic.

    Returns:
        (body_text, frontmatter_dict)
    """
    if not file_path or not os.path.exists(file_path):
        return "", {}
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            raw = f.read()
    except Exception:
        return "", {}
    # Strip frontmatter via regex
    m = FRONTMATTER_RE.match(raw)
    fm = {}
    if m:
        fm_txt = m.group(1)
        try:
            import yaml  # lazy import; only needed for raw-mode frontmatter
            fm = yaml.safe_load(fm_txt) or {}
        except Exception:
            fm = {}
        body = raw[m.end():]
    else:
        body = raw
    return body, fm
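
# Example (sketch, assuming FRONTMATTER_RE matches a leading "---"-delimited
# YAML block as defined in the parser module): for a file containing
#
#   ---
#   title: Demo
#   ---
#   Hello
#
# this would return ("Hello\n", {"title": "Demo"}); a file without a
# frontmatter block comes back whole as the body, with an empty dict.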
# ---------------------------------------------------------------------------
# Hashing
# ---------------------------------------------------------------------------

def _sha256(s: str) -> str:
    h = hashlib.sha256()
    h.update(s.encode("utf-8"))
    return h.hexdigest()

def compute_hash(*, body: Optional[str], frontmatter: Optional[Dict[str, Any]],
                 mode: Optional[str] = None, normalize: Optional[str] = None) -> str:
    """
    Computes a hex hash according to ``mode`` and ``normalize``.
    mode:      "body" | "frontmatter" | "body+frontmatter"
    normalize: "canonical" | "none"
    """
    mode = _resolve_hash_mode(mode)
    normalize = (normalize or os.environ.get("MINDNET_HASH_NORMALIZE", "canonical")).strip().lower()
    body_n = _normalize_body(body or "", normalize)
    fm_s = _canon_frontmatter(frontmatter or {})
    if mode == "frontmatter":
        return _sha256(fm_s)
    if mode == "body+frontmatter":
        return _sha256(body_n + "\n--FM--\n" + fm_s)
    # default: body
    return _sha256(body_n)
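
# Example (sketch): the "\n--FM--\n" sentinel keeps the body and frontmatter
# contributions separated in "body+frontmatter" mode, so content cannot shift
# ambiguously across the boundary:
#   compute_hash(body="x", frontmatter={"a": 1}, mode="full", normalize="none")
#     == sha256 of 'x\n--FM--\n{"a":1}'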

def compute_hash_set(*, body_parsed: str, body_raw: str, fm: Dict[str, Any], normalize: str) -> Dict[str, str]:
    """Computes a set of hashes for monitoring/debugging."""
    fm_s = _canon_frontmatter(fm or {})
    bp = _normalize_body(body_parsed or "", normalize)
    br = _normalize_body(body_raw or "", normalize)
    return {
        "frontmatter": _sha256(fm_s),
        "body_parsed": _sha256(bp),
        "body_raw": _sha256(br),
        "full_parsed": _sha256(bp + "\n--FM--\n" + fm_s),
        "full_raw": _sha256(br + "\n--FM--\n" + fm_s),
    }
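
# Example (sketch): with MINDNET_HASH_RECORD_ALL=true the payload carries all
# five variants, which helps diagnose *why* a note re-indexed; e.g. a changed
# "body_raw" next to an unchanged "body_parsed" points at whitespace or
# frontmatter-boundary differences that the parser normalizes away.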
# ---------------------------------------------------------------------------
# Core function
# ---------------------------------------------------------------------------
def make_note_payload(
    parsed: Any,
    vault_root: Optional[str] = None,
    *,
    hash_mode: Optional[str] = None,
    hash_normalize: Optional[str] = None,
    hash_source: Optional[str] = None,
    file_path: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Builds the payload for a parsed note.

    Parameters
    ----------
    parsed : Any
        Object with attributes/keys ``frontmatter``, ``body``, ``path``.
    vault_root : Optional[str]
        Vault root (for making the path relative).
    hash_mode : Optional[str]
        "body" | "frontmatter" | "body+frontmatter" | "full" (alias; overrides ENV).
    hash_normalize : Optional[str]
        "canonical" | "none" (overrides ENV).
    hash_source : Optional[str]
        "parsed" (default) or "raw". With "raw", the body is read from the raw file.
    file_path : Optional[str]
        Path to the Markdown file; required for ``hash_source=raw``.

    Returns
    -------
    Dict[str, Any]
        Qdrant payload for the notes collection.
    """
# "Duck typing": dict oder Objekt akzeptieren
if isinstance(parsed, dict):
fm = parsed.get("frontmatter") or {}
body_parsed = parsed.get("body") or ""
path = parsed.get("path") or ""
else:
fm = getattr(parsed, "frontmatter", {}) or {}
body_parsed = getattr(parsed, "body", "") or ""
path = getattr(parsed, "path", "") or ""
# Hash-Quelle bestimmen
src = (hash_source or os.environ.get("MINDNET_HASH_SOURCE", "parsed")).strip().lower()
raw_body, raw_fm = ("", {})
if src == "raw":
raw_body, raw_fm = _read_raw_body_from_file(file_path or path)
# Roh-FM ergänzen (nicht überschreiben)
if isinstance(raw_fm, dict) and raw_fm:
merged_fm = dict(fm)
for k, v in raw_fm.items():
merged_fm.setdefault(k, v)
fm = merged_fm
normalize = (hash_normalize or os.environ.get("MINDNET_HASH_NORMALIZE", "canonical")).strip().lower()
mode_resolved = _resolve_hash_mode(hash_mode)
# Hash gemäß Modus/Quelle bilden
body_for_hash = raw_body if src == "raw" else body_parsed
primary_hash = compute_hash(body=body_for_hash, frontmatter=fm, mode=mode_resolved, normalize=normalize)
hash_signature = f"{'full' if mode_resolved=='body+frontmatter' else mode_resolved}:{src}:{normalize}:{primary_hash}"
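    # e.g. "full:raw:none:<hex>" -- mode (with the "full" shorthand), source,
    # normalization, and the hex digest in one comparable string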
    # Make the path relative to the vault root
    rel_path = path
    try:
        if vault_root:
            rel = os.path.relpath(path, vault_root)
            rel = rel.replace("\\", "/").lstrip("/")  # normalize separators
            rel_path = rel
    except Exception:
        pass
    # Note-level wikilinks (fallback when no chunks are supplied)
    note_level_refs = list(dict.fromkeys(extract_wikilinks(body_parsed))) if body_parsed else []
    payload: Dict[str, Any] = {
        "note_id": fm.get("id") or fm.get("note_id"),
        "title": fm.get("title"),
        "type": fm.get("type"),
        "status": fm.get("status"),
        "created": fm.get("created"),
        "updated": fm.get("updated"),
        "path": rel_path or fm.get("path"),
        "tags": fm.get("tags"),
        # Primary hash + signature (for comparison)
        "hash_fulltext": primary_hash,
        "hash_signature": hash_signature,
        # Persist the full text (lossless reconstruction) from the parsed body
        "fulltext": body_parsed,
        # Fallback references at note level
        "references": note_level_refs,
    }
    for k in ("area", "project", "source", "lang", "slug", "aliases"):
        if k in fm:
            payload[k] = fm[k]
    # Optional: persist the complete hash set (debug/monitoring)
    if os.environ.get("MINDNET_HASH_RECORD_ALL", "false").strip().lower() == "true":
        payload["hashes"] = compute_hash_set(body_parsed=body_parsed, body_raw=raw_body, fm=fm, normalize=normalize)
    # Optional: store the raw-body hash separately (historical compatibility)
    if os.environ.get("MINDNET_HASH_STORE_RAW", "false").strip().lower() == "true" and src == "raw":
        try:
            payload["hash_raw_body"] = compute_hash(body=raw_body, frontmatter=fm, mode="body", normalize="none")
        except Exception:
            pass
    return payload
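
# Usage example (sketch; assumes extract_wikilinks returns link targets such
# as "World" for "[[World]]"):
#
#   parsed = {"frontmatter": {"id": "n1", "title": "Demo"},
#             "body": "Hello [[World]]", "path": "/vault/demo.md"}
#   payload = make_note_payload(parsed, vault_root="/vault")
#   # payload["path"] == "demo.md"
#   # payload["hash_signature"].startswith("body:parsed:canonical:")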
# ---------------------------------------------------------------------------
# CLI (visual check)
# ---------------------------------------------------------------------------
def _cli() -> None:
    ap = argparse.ArgumentParser(description="Build a note payload from Markdown and display it")
    ap.add_argument("--from-file", dest="src", required=True, help="Path to the Markdown file")
    ap.add_argument("--vault-root", dest="vault_root", default=None, help="Vault root for making the path relative")
    ap.add_argument("--print", dest="do_print", action="store_true", help="Print the payload to stdout")
    ap.add_argument("--hash-mode", choices=["body", "frontmatter", "body+frontmatter", "full"], default=None)
    ap.add_argument("--hash-normalize", choices=["canonical", "none"], default=None)
    ap.add_argument("--hash-source", choices=["parsed", "raw"], default=None)
    args = ap.parse_args()
    parsed = read_markdown(args.src)
    payload = make_note_payload(parsed, vault_root=args.vault_root,
                                hash_mode=args.hash_mode, hash_normalize=args.hash_normalize,
                                hash_source=args.hash_source, file_path=args.src)
    if args.do_print:
        print(json.dumps(payload, ensure_ascii=False, indent=2))


if __name__ == "__main__":  # pragma: no cover
    _cli()