169 lines
5.7 KiB
Python
169 lines
5.7 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
"""
|
||
Modul: app/core/note_payload.py
|
||
Version: 1.3.1
|
||
Datum: 2025-09-09
|
||
|
||
Kurzbeschreibung
|
||
----------------
|
||
Erzeugt den Qdrant-Payload für **Notes**. Neben stabiler Hash-Bildung zur
|
||
Idempotenz wird der vollständige Body unter ``fulltext`` persistiert und
|
||
der Dateipfad relativ zum Vault gespeichert. Das erlaubt eine verlustfreie
|
||
Rekonstruktion im Export (erst ``fulltext``, sonst Chunks).
|
||
|
||
Wesentliche Features
|
||
--------------------
|
||
- Hash-Strategie via ENV ``MINDNET_HASH_MODE``:
|
||
* ``body`` (Default)
|
||
* ``frontmatter``
|
||
* ``body+frontmatter``
|
||
- Persistenter Volltext im Note-Payload: ``fulltext``
|
||
- Pfad-Relativierung (``path``) gegen ``vault_root``
|
||
- Optionale Note-Level-Wikilinks (Fallback-Refs)
|
||
|
||
Beispiele (CLI – Sichtprüfung)
|
||
------------------------------
|
||
python3 -m app.core.note_payload --from-file ./vault/demo.md --vault-root ./vault --print
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
import argparse
|
||
import hashlib
|
||
import json
|
||
import os
|
||
from typing import Any, Dict, Optional
|
||
|
||
try:
|
||
# In deinem Parser heißen die Funktionen read_markdown / extract_wikilinks
|
||
from app.core.parser import read_markdown, extract_wikilinks
|
||
except Exception: # pragma: no cover
|
||
from .parser import read_markdown, extract_wikilinks # type: ignore
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Hashing
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _json_default(value: Any) -> Any:
    """Return a stable JSON-compatible form for date/time frontmatter values.

    Used as the ``default=`` hook of ``json.dumps``; it is only invoked for
    objects the encoder cannot serialize natively, so output for plain JSON
    types is unaffected.

    NOTE(review): presumably such values come from the YAML frontmatter
    loader (unquoted timestamps) -- the parser is not visible here; confirm.

    Raises
    ------
    TypeError
        For any other unsupported type (same exception type ``json.dumps``
        raises without a ``default`` hook), so unknown objects never end up
        in the hash via an unstable ``repr``.
    """
    # Local import keeps this block self-contained without touching the
    # module's import section.
    from datetime import date, time

    # ``datetime`` is a subclass of ``date``, so it is covered as well.
    if isinstance(value, (date, time)):
        return value.isoformat()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


def _canon_frontmatter(fm: Dict[str, Any]) -> str:
    """Serialize the frontmatter into canonical, stable JSON for hashing.

    Keys are sorted and separators are compact, so the same mapping always
    yields the same string. ``None`` or an empty mapping serializes as
    ``{}``. Date/time values are written in ISO-8601 form.
    """
    return json.dumps(
        fm or {},
        ensure_ascii=False,
        separators=(",", ":"),
        sort_keys=True,
        default=_json_default,
    )


def compute_hash(*, body: Optional[str], frontmatter: Optional[Dict[str, Any]], mode: Optional[str] = None) -> str:
    """
    Compute the SHA-256 hex digest of a note according to ``mode``.

    Parameters
    ----------
    body : Optional[str]
        Note body; ``None`` is treated as the empty string.
    frontmatter : Optional[Dict[str, Any]]
        Note frontmatter; ``None`` is treated as an empty mapping.
    mode : Optional[str]
        One of ``"body"`` (default), ``"frontmatter"`` or
        ``"body+frontmatter"``. When falsy, the environment variable
        ``MINDNET_HASH_MODE`` is consulted, falling back to ``"body"``.
        The value is stripped and lower-cased; any unrecognized value
        behaves like ``"body"``.

    Returns
    -------
    str
        Hex digest of the selected content.

    Raises
    ------
    TypeError
        Only in the frontmatter-based modes, when the frontmatter contains
        a value that cannot be serialized to JSON.
    """
    selected = (mode or os.environ.get("MINDNET_HASH_MODE", "body")).strip().lower()
    body_bytes = (body or "").encode("utf-8")

    digest = hashlib.sha256()
    # The frontmatter is serialized only in the modes that hash it, so a
    # frontmatter with non-serializable values cannot break pure body hashing.
    if selected == "frontmatter":
        digest.update(_canon_frontmatter(frontmatter or {}).encode("utf-8"))
    elif selected == "body+frontmatter":
        digest.update(body_bytes)
        # Fixed separator keeps body and frontmatter from running together.
        digest.update(b"\n--FM--\n")
        digest.update(_canon_frontmatter(frontmatter or {}).encode("utf-8"))
    else:  # "body" and every unrecognized mode
        digest.update(body_bytes)
    return digest.hexdigest()
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Kernfunktion
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def make_note_payload(parsed: Any, vault_root: Optional[str] = None) -> Dict[str, Any]:
    """
    Build the Qdrant payload dictionary for one parsed note.

    Parameters
    ----------
    parsed : Any
        Either an object or a ``dict``. The frontmatter is taken from the
        first truthy attribute among ``frontmatter``, ``fm`` and
        ``front_matter``, otherwise from the dict key ``"frontmatter"``.
        ``body`` and ``path`` are read as attribute first, dict key second.
        Anything missing becomes an empty mapping / empty string.
    vault_root : Optional[str]
        When truthy, ``path`` is made relative to it and written with
        forward slashes; otherwise ``path`` is stored unchanged.

    Returns
    -------
    Dict[str, Any]
        Payload containing the core frontmatter fields, the content hash,
        the full body under ``fulltext``, de-duplicated note-level
        wikilinks under ``references`` and any of the optional frontmatter
        fields ``area``, ``project``, ``source``, ``lang``, ``slug`` that
        are present.
    """

    def _text_field(name: str) -> Any:
        # Attribute access wins; dict lookup is the fallback; never None.
        found = getattr(parsed, name, None)
        if not found and isinstance(parsed, dict):
            found = parsed.get(name)
        return found or ""

    # Frontmatter: first truthy attribute alias, then the dict key.
    for alias in ("frontmatter", "fm", "front_matter"):
        meta = getattr(parsed, alias, None)
        if meta:
            break
    else:
        meta = parsed.get("frontmatter") if isinstance(parsed, dict) else None
    meta = meta or {}

    text = _text_field("body")
    source_path = _text_field("path")

    # mode=None lets compute_hash pick the mode (environment or its default).
    content_hash = compute_hash(body=text, frontmatter=meta, mode=None)

    # Best effort: any failure while relativizing keeps the original path.
    stored_path = source_path
    try:
        if vault_root:
            stored_path = os.path.relpath(source_path, vault_root).replace("\\", "/").lstrip("/")
    except Exception:
        pass

    # Note-level wikilinks, de-duplicated in first-seen order.
    if text:
        refs = list(dict.fromkeys(extract_wikilinks(text)))
    else:
        refs = []

    payload: Dict[str, Any] = {
        "note_id": meta.get("id") or meta.get("note_id"),
        "title": meta.get("title"),
        "type": meta.get("type"),
        "status": meta.get("status"),
        "created": meta.get("created"),
        "updated": meta.get("updated"),
        "path": stored_path or meta.get("path"),
        "tags": meta.get("tags"),
        "hash_fulltext": content_hash,
        # Persist the complete body with the note.
        "fulltext": text,
        # Optional fallback source for deriving edges.
        "references": refs,
    }

    # Pass through the known optional frontmatter fields that are present.
    payload.update(
        (key, meta[key])
        for key in ("area", "project", "source", "lang", "slug")
        if key in meta
    )
    return payload
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# CLI – Sichtprüfung
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _cli() -> None:
    """Command-line entry: build the payload for one Markdown file.

    Reads the file given via ``--from-file``, builds its note payload
    (optionally relativizing the path against ``--vault-root``) and writes
    the payload as indented JSON to stdout only when ``--print`` is given.
    """
    parser = argparse.ArgumentParser(description="Note-Payload aus Markdown erzeugen und anzeigen")
    option_specs = (
        ("--from-file", {"dest": "src", "required": True, "help": "Pfad zur Markdown-Datei"}),
        ("--vault-root", {"dest": "vault_root", "default": None, "help": "Vault-Wurzel zur Pfad-Relativierung"}),
        ("--print", {"dest": "do_print", "action": "store_true", "help": "Payload auf stdout ausgeben"}),
    )
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    options = parser.parse_args()

    note = read_markdown(options.src)
    payload = make_note_payload(note, vault_root=options.vault_root)

    # Without --print the payload is built but nothing is emitted.
    if not options.do_print:
        return
    print(json.dumps(payload, ensure_ascii=False, indent=2))


if __name__ == "__main__":  # pragma: no cover
    _cli()
|