All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
247 lines
8.5 KiB
Python
247 lines
8.5 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
# Module: app/core/note_payload.py
# Version: 1.8.0
# Date: 2025-11-08
# Changes:
# - 'retriever_weight' (float; default via ENV MINDNET_DEFAULT_RETRIEVER_WEIGHT, otherwise 1.0) taken from the frontmatter into the note payload.
# - 'chunk_profile' (if present) carried over.
# - Hash logic unchanged, compatible with 1.7.0.
||
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import hashlib
|
||
import json
|
||
import os
|
||
from typing import Any, Dict, Optional, Tuple
|
||
|
||
try:
|
||
from app.core.parser import read_markdown, extract_wikilinks, FRONTMATTER_RE
|
||
except Exception: # pragma: no cover
|
||
from .parser import read_markdown, extract_wikilinks, FRONTMATTER_RE # type: ignore
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Helpers
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _canon_frontmatter(fm: Dict[str, Any]) -> str:
|
||
return json.dumps(fm or {}, ensure_ascii=False, separators=(",", ":"), sort_keys=True)
|
||
|
||
def _normalize_body(body: str, mode: str) -> str:
|
||
if mode == "none":
|
||
return body if body is not None else ""
|
||
text = (body or "").replace("\r\n", "\n").replace("\r", "\n")
|
||
text = "\n".join(line.rstrip() for line in text.split("\n"))
|
||
return text
|
||
|
||
def _resolve_hash_mode(explicit: Optional[str]) -> str:
|
||
if explicit:
|
||
val = explicit.strip().lower()
|
||
else:
|
||
val = (os.environ.get("MINDNET_HASH_MODE") or os.environ.get("MINDNET_HASH_COMPARE") or "body").strip().lower()
|
||
if val in ("full", "fulltext", "body+frontmatter", "bodyplusfrontmatter"):
|
||
return "full"
|
||
if val in ("frontmatter", "fm"):
|
||
return "frontmatter"
|
||
return "body"
|
||
|
||
def _read_raw_body_from_file(file_path: Optional[str]) -> Tuple[str, Dict[str, Any]]:
    """Read a markdown file from disk and split it into (body, frontmatter).

    Best-effort: a missing path, read error, or YAML parse failure yields
    ("", {}) or an empty frontmatter dict instead of raising.
    """
    if not file_path or not os.path.exists(file_path):
        return "", {}
    try:
        with open(file_path, "r", encoding="utf-8") as fh:
            raw = fh.read()
    except Exception:
        return "", {}

    match = FRONTMATTER_RE.match(raw)
    if not match:
        return raw, {}

    frontmatter: Dict[str, Any] = {}
    try:
        import yaml  # lazy import: only needed when frontmatter is present
        frontmatter = yaml.safe_load(match.group(1)) or {}
    except Exception:
        frontmatter = {}
    return raw[match.end():], frontmatter
|
||
|
||
def _sha256(s: str) -> str:
|
||
h = hashlib.sha256()
|
||
h.update(s.encode("utf-8"))
|
||
return h.hexdigest()
|
||
|
||
def _hash_for(mode: str, *, body: str, fm: Dict[str, Any], normalize: str) -> str:
    """Compute the note hash for the given *mode*.

    mode 'frontmatter' hashes only the canonical frontmatter JSON;
    mode 'full' hashes normalized body + separator + frontmatter JSON;
    any other mode (default 'body') hashes the normalized body alone.
    """
    normalized_body = _normalize_body(body or "", normalize)
    canonical_fm = _canon_frontmatter(fm or {})
    if mode == "frontmatter":
        return _sha256(canonical_fm)
    if mode == "full":
        # Explicit separator keeps the body/frontmatter boundary unambiguous.
        return _sha256(normalized_body + "\n--FM--\n" + canonical_fm)
    return _sha256(normalized_body)  # default: body
|
||
|
||
def _to_float(val: Any, default: float) -> float:
|
||
try:
|
||
if val is None:
|
||
return float(default)
|
||
if isinstance(val, (int, float)):
|
||
return float(val)
|
||
s = str(val).strip().replace(",", ".")
|
||
return float(s)
|
||
except Exception:
|
||
return float(default)
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Kernfunktion
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def make_note_payload(
    parsed: Any,
    vault_root: Optional[str] = None,
    *,
    hash_mode: Optional[str] = None,
    hash_normalize: Optional[str] = None,
    hash_source: Optional[str] = None,
    file_path: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Build the note payload including multiple hashes and frontmatter fields.

    Parameters
    ----------
    parsed : dict or object
        Either a dict with 'frontmatter'/'body'/'path' keys or an object
        exposing those attributes.
    vault_root : str, optional
        When given, 'path' in the payload is stored relative to this root
        (forward slashes, no leading slash).
    hash_mode : str, optional
        'body' | 'frontmatter' | 'full'; falls back to MINDNET_HASH_MODE /
        MINDNET_HASH_COMPARE env vars, default 'body'.
    hash_normalize : str, optional
        'canonical' | 'none'; falls back to MINDNET_HASH_NORMALIZE.
    hash_source : str, optional
        'parsed' | 'raw'; falls back to MINDNET_HASH_SOURCE. 'raw' re-reads
        the file from disk for hashing.
    file_path : str, optional
        File to read when hash_source == 'raw' (defaults to the parsed path).

    Returns
    -------
    Dict[str, Any]
        Identity fields from the frontmatter, the full body text, the
        standard hash triple plus configuration-specific hashes, fallback
        wikilink references, and (since 1.8.0) 'retriever_weight' and
        optionally 'chunk_profile'.
    """
    # Accept either a dict or an object with attributes.
    if isinstance(parsed, dict):
        fm = parsed.get("frontmatter") or {}
        body_parsed = parsed.get("body") or ""
        path = parsed.get("path") or ""
    else:
        fm = getattr(parsed, "frontmatter", {}) or {}
        body_parsed = getattr(parsed, "body", "") or ""
        path = getattr(parsed, "path", "") or ""

    # Make the target path relative to the vault root (best effort).
    rel_path = path
    try:
        if vault_root:
            rel = os.path.relpath(path, vault_root)
            rel = rel.replace("\\", "/").lstrip("/")
            rel_path = rel
    except Exception:
        pass

    # Resolve hashing configuration (explicit args win over environment).
    mode_resolved = _resolve_hash_mode(hash_mode)  # body|frontmatter|full
    src = (hash_source or os.environ.get("MINDNET_HASH_SOURCE", "parsed")).strip().lower()  # parsed|raw
    norm = (hash_normalize or os.environ.get("MINDNET_HASH_NORMALIZE", "canonical")).strip().lower()  # canonical|none

    # Load the body source; for 'raw', frontmatter keys read from disk are
    # merged in without overwriting already-parsed keys (setdefault).
    raw_body, raw_fm = ("", {})
    if src == "raw":
        raw_body, raw_fm = _read_raw_body_from_file(file_path or path)
        if isinstance(raw_fm, dict) and raw_fm:
            merged_fm = dict(fm)
            for k, v in raw_fm.items():
                merged_fm.setdefault(k, v)
            fm = merged_fm
        body_for_hash = raw_body
    else:
        body_for_hash = body_parsed

    # --- 1) Standard triple (parsed:canonical) is always produced ---
    std_src = "parsed"
    std_norm = "canonical"
    std_hashes: Dict[str, str] = {}
    for m in ("body", "frontmatter", "full"):
        std_hashes[f"{m}:{std_src}:{std_norm}"] = _hash_for(
            m, body=body_parsed, fm=fm, normalize=std_norm
        )

    # Convenience fields (for tools)
    hash_body = std_hashes["body:parsed:canonical"]
    hash_frontmatter = std_hashes["frontmatter:parsed:canonical"]
    hash_full = std_hashes["full:parsed:canonical"]

    # --- 2) Hashes for the *current* configuration (only if it differs) ---
    cur_hashes: Dict[str, str] = {}
    if not (src == std_src and norm == std_norm):
        for m in ("body", "frontmatter", "full"):
            cur_hashes[f"{m}:{src}:{norm}"] = _hash_for(
                m, body=body_for_hash, fm=fm, normalize=norm
            )

    # --- 3) Current mode for backwards-compat fields ---
    current_hash = _hash_for(mode_resolved, body=body_for_hash, fm=fm, normalize=norm)
    hash_signature = f"{mode_resolved}:{src}:{norm}:{current_hash}"

    # Wikilinks (note level), de-duplicated while preserving order.
    refs = list(dict.fromkeys(extract_wikilinks(body_parsed))) if body_parsed else []

    # NEW in 1.8.0: defaults & casting for retriever weight / chunk profile.
    default_rw = _to_float(os.environ.get("MINDNET_DEFAULT_RETRIEVER_WEIGHT", 1.0), 1.0)
    fm_rw = _to_float(fm.get("retriever_weight"), default_rw)
    fm_chunk_profile = fm.get("chunk_profile") or fm.get("profile") or None

    payload: Dict[str, Any] = {
        "note_id": fm.get("id") or fm.get("note_id"),
        "title": fm.get("title"),
        "type": fm.get("type"),
        "status": fm.get("status"),
        "created": fm.get("created"),
        "updated": fm.get("updated"),
        "path": rel_path or fm.get("path"),
        "tags": fm.get("tags"),
        # Full text for lossless export
        "fulltext": body_parsed,
        # Backwards-compat:
        "hash_fulltext": current_hash,
        "hash_signature": hash_signature,
        # Option C: multiple hashes
        "hashes": {**std_hashes, **cur_hashes},
        "hash_body": hash_body,
        "hash_frontmatter": hash_frontmatter,
        "hash_full": hash_full,
        # Fallback refs
        "references": refs,
        # NEW:
        "retriever_weight": fm_rw,
    }

    if fm_chunk_profile is not None:
        payload["chunk_profile"] = str(fm_chunk_profile)

    # Pass through optional metadata keys verbatim when present.
    for k in ("area", "project", "source", "lang", "slug", "aliases"):
        if k in fm:
            payload[k] = fm[k]

    return payload
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# CLI – Sichtprüfung
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _cli() -> None:
    """CLI for visual inspection: build a note payload from a markdown file
    and optionally print it as JSON."""
    parser = argparse.ArgumentParser(description="Note-Payload aus Markdown erzeugen und anzeigen")
    parser.add_argument("--from-file", dest="src", required=True)
    parser.add_argument("--vault-root", dest="vault_root", default=None)
    parser.add_argument("--print", dest="do_print", action="store_true")
    parser.add_argument("--hash-mode", choices=["body", "frontmatter", "full"], default=None)
    parser.add_argument("--hash-normalize", choices=["canonical", "none"], default=None)
    parser.add_argument("--hash-source", choices=["parsed", "raw"], default=None)
    args = parser.parse_args()

    payload = make_note_payload(
        read_markdown(args.src),
        vault_root=args.vault_root,
        hash_mode=args.hash_mode,
        hash_normalize=args.hash_normalize,
        hash_source=args.hash_source,
        file_path=args.src,
    )
    if args.do_print:
        print(json.dumps(payload, ensure_ascii=False, indent=2))
|
||
|
||
# Script entry point for manual inspection.
if __name__ == "__main__":  # pragma: no cover
    _cli()
|