mindnet/app/core/note_payload.py
Lars a7c5630e5b
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
app/core/note_payload.py aktualisiert
2025-11-08 21:48:01 +01:00

266 lines
9.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Modul: app/core/note_payload.py
# Version: 1.7.0
# Datum: 2025-09-09
from __future__ import annotations
import argparse
import hashlib
import json
import os
from typing import Any, Dict, Optional, Tuple
try:
from app.core.parser import read_markdown, extract_wikilinks, FRONTMATTER_RE
except Exception: # pragma: no cover
from .parser import read_markdown, extract_wikilinks, FRONTMATTER_RE # type: ignore
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _canon_frontmatter(fm: Dict[str, Any]) -> str:
return json.dumps(fm or {}, ensure_ascii=False, separators=(",", ":"), sort_keys=True)
def _normalize_body(body: str, mode: str) -> str:
if mode == "none":
return body if body is not None else ""
text = (body or "").replace("\r\n", "\n").replace("\r", "\n")
text = "\n".join(line.rstrip() for line in text.split("\n"))
return text
def _resolve_hash_mode(explicit: Optional[str]) -> str:
if explicit:
val = explicit.strip().lower()
else:
val = (os.environ.get("MINDNET_HASH_MODE")
or os.environ.get("MINDNET_HASH_COMPARE")
or "body").strip().lower()
if val in ("full", "fulltext", "body+frontmatter", "bodyplusfrontmatter"):
return "full"
if val in ("frontmatter", "fm"):
return "frontmatter"
return "body"
def _read_raw_body_from_file(file_path: Optional[str]) -> Tuple[str, Dict[str, Any]]:
    """Read a markdown file and split it into (body, frontmatter dict).

    Returns ("", {}) when the path is missing or unreadable.  If a YAML
    frontmatter block is present it is parsed best-effort; parse errors
    yield an empty dict rather than an exception.
    """
    if not file_path or not os.path.exists(file_path):
        return "", {}
    try:
        with open(file_path, "r", encoding="utf-8") as fh:
            raw = fh.read()
    except Exception:
        return "", {}
    match = FRONTMATTER_RE.match(raw)
    if not match:
        return raw, {}
    fm: Dict[str, Any] = {}
    try:
        import yaml  # lazy: only needed when frontmatter is present
        fm = yaml.safe_load(match.group(1)) or {}
    except Exception:
        fm = {}
    return raw[match.end():], fm
def _sha256(s: str) -> str:
h = hashlib.sha256()
h.update(s.encode("utf-8"))
return h.hexdigest()
def _hash_for(mode: str, *, body: str, fm: Dict[str, Any], normalize: str) -> str:
    """Compute one hash for the given mode.

    'frontmatter' hashes the canonical frontmatter JSON, 'full' hashes the
    normalized body joined to the frontmatter via a '--FM--' separator, and
    anything else (the default, 'body') hashes the normalized body alone.
    """
    canonical_fm = _canon_frontmatter(fm or {})
    normalized = _normalize_body(body or "", normalize)
    if mode == "frontmatter":
        return _sha256(canonical_fm)
    if mode == "full":
        return _sha256(normalized + "\n--FM--\n" + canonical_fm)
    return _sha256(normalized)
# ---------------------------------------------------------------------------
# Kernfunktion
# ---------------------------------------------------------------------------
def make_note_payload(
    parsed: Any,
    vault_root: Optional[str] = None,
    *,
    hash_mode: Optional[str] = None,
    hash_normalize: Optional[str] = None,
    hash_source: Optional[str] = None,
    file_path: Optional[str] = None,
    retriever_weight: Optional[float] = None,
    type_defaults: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Build the note payload including multiple hashes.

    - The three hashes for (body|frontmatter|full) under 'parsed:canonical'
      are ALWAYS produced (keys such as 'body:parsed:canonical').
    - If the current configuration (source/normalize) differs from that,
      the three hashes are additionally produced under the corresponding
      keys, e.g. 'frontmatter:raw:none'.
    - 'hash_fulltext' and 'hash_signature' represent the *current* mode.

    Args:
        parsed: Parsed note — either a dict or an object exposing the
            attributes 'frontmatter', 'body' and 'path'.
        vault_root: If given, the payload 'path' is made relative to it.
        hash_mode: Overrides MINDNET_HASH_MODE / MINDNET_HASH_COMPARE.
        hash_normalize: Overrides MINDNET_HASH_NORMALIZE ('canonical'|'none').
        hash_source: Overrides MINDNET_HASH_SOURCE ('parsed'|'raw').
        file_path: Explicit file path used when the source resolves to 'raw'.
        retriever_weight: Fallback retriever weight, used only when neither
            the frontmatter nor `type_defaults` supply one.
        type_defaults: Optional mapping of note type -> defaults dict that
            may contain a per-type 'retriever_weight'.

    Returns:
        Dict with identity fields, fulltext, hashes, references and —
        when resolvable — 'retriever_weight'.
    """
    # Accept dict or object input.
    if isinstance(parsed, dict):
        fm = parsed.get("frontmatter") or {}
        body_parsed = parsed.get("body") or ""
        path = parsed.get("path") or ""
    else:
        fm = getattr(parsed, "frontmatter", {}) or {}
        body_parsed = getattr(parsed, "body", "") or ""
        path = getattr(parsed, "path", "") or ""
    if not isinstance(fm, dict):  # defensive: malformed frontmatter
        fm = {}

    # Make the stored path relative to the vault root (best effort).
    rel_path = path
    try:
        if vault_root:
            rel = os.path.relpath(path, vault_root)
            rel_path = rel.replace("\\", "/").lstrip("/")
    except Exception:
        pass

    # Resolve hash configuration (explicit args win over environment).
    mode_resolved = _resolve_hash_mode(hash_mode)  # body|frontmatter|full
    src = (hash_source or os.environ.get("MINDNET_HASH_SOURCE", "parsed")).strip().lower()  # parsed|raw
    norm = (hash_normalize or os.environ.get("MINDNET_HASH_NORMALIZE", "canonical")).strip().lower()  # canonical|none

    # Pick the body used for hashing; for 'raw', merge frontmatter keys that
    # only exist on disk (parsed values take precedence).
    if src == "raw":
        raw_body, raw_fm = _read_raw_body_from_file(file_path or path)
        if isinstance(raw_fm, dict) and raw_fm:
            merged_fm = dict(fm)
            for k, v in raw_fm.items():
                merged_fm.setdefault(k, v)
            fm = merged_fm
        body_for_hash = raw_body
    else:
        body_for_hash = body_parsed

    # --- 1) Standard triple (parsed:canonical), always produced ---
    std_src, std_norm = "parsed", "canonical"
    std_hashes: Dict[str, str] = {
        f"{m}:{std_src}:{std_norm}": _hash_for(m, body=body_parsed, fm=fm, normalize=std_norm)
        for m in ("body", "frontmatter", "full")
    }
    # Convenience fields (for tools).
    hash_body = std_hashes["body:parsed:canonical"]
    hash_frontmatter = std_hashes["frontmatter:parsed:canonical"]
    hash_full = std_hashes["full:parsed:canonical"]

    # --- 2) Hashes for the *current* configuration (only if it differs) ---
    cur_hashes: Dict[str, str] = {}
    if not (src == std_src and norm == std_norm):
        cur_hashes = {
            f"{m}:{src}:{norm}": _hash_for(m, body=body_for_hash, fm=fm, normalize=norm)
            for m in ("body", "frontmatter", "full")
        }

    # --- 3) Current mode for backwards-compat fields ---
    current_hash = _hash_for(mode_resolved, body=body_for_hash, fm=fm, normalize=norm)
    hash_signature = f"{mode_resolved}:{src}:{norm}:{current_hash}"

    # Note-level wikilinks, de-duplicated while preserving order.
    refs = list(dict.fromkeys(extract_wikilinks(body_parsed))) if body_parsed else []

    payload: Dict[str, Any] = {
        "note_id": fm.get("id") or fm.get("note_id"),
        "title": fm.get("title"),
        "type": fm.get("type"),
        "status": fm.get("status"),
        "created": fm.get("created"),
        "updated": fm.get("updated"),
        "path": rel_path or fm.get("path"),
        "tags": fm.get("tags"),
        # Fulltext for lossless export.
        "fulltext": body_parsed,
        # Backwards-compat:
        "hash_fulltext": current_hash,
        "hash_signature": hash_signature,
        # Option C: multiple hashes.
        "hashes": {**std_hashes, **cur_hashes},
        "hash_body": hash_body,
        "hash_frontmatter": hash_frontmatter,
        "hash_full": hash_full,
        # Fallback refs.
        "references": refs,
    }
    for k in ("area", "project", "source", "lang", "slug", "aliases"):
        if k in fm:
            payload[k] = fm[k]

    # retriever_weight cascade: frontmatter > type_defaults[type] > argument.
    # NOTE: the previous "MINIMAL PATCH" referenced undefined names
    # (parsed_note, type_defaults, retriever_weight) and raised NameError on
    # every call; the values are now proper keyword arguments.
    note_type = fm.get("type") or getattr(parsed, "type", None)
    rw_val: Any = None
    if "retriever_weight" in fm:
        rw_val = fm["retriever_weight"]
    elif (type_defaults
          and isinstance(type_defaults.get(note_type), dict)
          and "retriever_weight" in type_defaults[note_type]):
        rw_val = type_defaults[note_type]["retriever_weight"]
    elif retriever_weight is not None:
        rw_val = retriever_weight
    if rw_val is not None:
        try:
            # Accept numeric strings with decimal commas (e.g. "0,5").
            payload["retriever_weight"] = float(str(rw_val).replace(",", "."))
        except Exception:
            pass  # non-numeric weight: omit rather than fail

    return payload
# ---------------------------------------------------------------------------
# CLI Sichtprüfung
# ---------------------------------------------------------------------------
def _cli() -> None:
    """CLI helper: build the payload for one markdown file and optionally print it as JSON."""
    parser = argparse.ArgumentParser(description="Note-Payload aus Markdown erzeugen und anzeigen")
    parser.add_argument("--from-file", dest="src", required=True)
    parser.add_argument("--vault-root", dest="vault_root", default=None)
    parser.add_argument("--print", dest="do_print", action="store_true")
    parser.add_argument("--hash-mode", choices=["body", "frontmatter", "full"], default=None)
    parser.add_argument("--hash-normalize", choices=["canonical", "none"], default=None)
    parser.add_argument("--hash-source", choices=["parsed", "raw"], default=None)
    opts = parser.parse_args()

    note = read_markdown(opts.src)
    result = make_note_payload(
        note,
        vault_root=opts.vault_root,
        hash_mode=opts.hash_mode,
        hash_normalize=opts.hash_normalize,
        hash_source=opts.hash_source,
        file_path=opts.src,
    )
    if opts.do_print:
        print(json.dumps(result, ensure_ascii=False, indent=2))
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":  # pragma: no cover
    _cli()