app/core/note_payload.py aktualisiert
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 1s
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 1s
This commit is contained in:
parent
22885a4857
commit
6c25d76135
|
|
@ -1,212 +1,164 @@
|
||||||
# app/core/note_payload.py
|
#!/usr/bin/env python3
|
||||||
# -----------------------------------------------------------------------------
|
# -*- coding: utf-8 -*-
|
||||||
# Name: note_payload.py
|
"""
|
||||||
# Version: 1.2.1 (2025-09-08)
|
Modul: app/core/note_payload.py
|
||||||
# Zweck: Erzeugt den Qdrant-Payload für Notes, inkl. deterministischer
|
Version: 1.3.0
|
||||||
# Hash-Bildung zur Idempotenz-Erkennung.
|
Datum: 2025-09-09
|
||||||
#
|
|
||||||
# Änderungen:
|
Kurzbeschreibung
|
||||||
# 1.2.1: Akzeptiert jetzt sowohl dict-Input als auch Objekt-Input (z. B. ParsedNote)
|
----------------
|
||||||
# mit Attributen .frontmatter, .body, .path. Dadurch kein AttributeError mehr.
|
Erzeugt den Qdrant-Payload für **Notes**. Neben stabiler Hash-Bildung zur
|
||||||
# 1.2.0: Konfigurierbare Hash-Strategie via ENV MINDNET_HASH_MODE
|
Idempotenz wird der vollständige Body unter ``fulltext`` persistiert und
|
||||||
# ('body' | 'body+frontmatter' | 'frontmatter'); kanonische FM-Serialisierung.
|
der Dateipfad relativ zum Vault gespeichert. Das erlaubt eine verlustfreie
|
||||||
#
|
Rekonstruktion im Export (erst ``fulltext``, sonst Chunks).
|
||||||
# Steuerung Hash-Strategie (unverändert):
|
|
||||||
# export MINDNET_HASH_MODE=body+frontmatter
|
Wesentliche Features
|
||||||
# MINDNET_HASH_MODE=frontmatter python3 -m scripts.import_markdown --vault ./vault --apply
|
--------------------
|
||||||
#
|
- Hash-Strategie via ENV ``MINDNET_HASH_MODE``:
|
||||||
# Hinweis:
|
* ``body`` (Default)
|
||||||
# - Datei-Zeitstempel (mtime/ctime) werden NICHT verwendet.
|
* ``frontmatter``
|
||||||
# - Default-Strategie bleibt 'body' (rückwärtskompatibel).
|
* ``body+frontmatter``
|
||||||
# -----------------------------------------------------------------------------
|
- Persistenter Volltext im Note-Payload: ``fulltext``
|
||||||
|
- Pfad-Relativierung (``path``) gegen ``vault_root``
|
||||||
|
- Optionale Note-Level-Wikilinks (Fallback-Refs)
|
||||||
|
|
||||||
|
Beispiele (CLI – Sichtprüfung)
|
||||||
|
------------------------------
|
||||||
|
python3 -m app.core.note_payload --from-file ./vault/demo.md --vault-root ./vault --print
|
||||||
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
import argparse
|
||||||
import hashlib
|
import hashlib
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
from typing import Any, Dict, Optional
|
from typing import Any, Dict, Optional
|
||||||
|
|
||||||
|
try:
|
||||||
# -----------------------------------------------------------------------------#
|
from app.core.parser import parse_markdown, extract_wikilinks
|
||||||
# Dienstfunktionen
|
except Exception: # pragma: no cover
|
||||||
# -----------------------------------------------------------------------------#
|
from .parser import parse_markdown, extract_wikilinks # type: ignore
|
||||||
|
|
||||||
def sha256_text(s: str) -> str:
    """Return the hexadecimal SHA-256 digest of ``s`` encoded as UTF-8."""
    digest = hashlib.sha256()
    digest.update(s.encode("utf-8"))
    return digest.hexdigest()
|
|
||||||
|
|
||||||
|
|
||||||
def canonicalize_frontmatter(fm: Dict[str, Any]) -> str:
|
# ---------------------------------------------------------------------------
|
||||||
|
# Hashing
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _canon_frontmatter(fm: Dict[str, Any]) -> str:
|
||||||
|
"""Kanonische, stabile JSON-Serialisierung der Frontmatter für Hashbildung."""
|
||||||
|
# Keys sortieren, ASCII nicht erzwingen, um Umlaute stabil zu halten
|
||||||
|
return json.dumps(fm or {}, ensure_ascii=False, separators=(",", ":"), sort_keys=True)
|
||||||
|
|
||||||
|
|
||||||
|
def compute_hash(*, body: Optional[str], frontmatter: Optional[Dict[str, Any]], mode: Optional[str] = None) -> str:
|
||||||
"""
|
"""
|
||||||
Serialisiert das Frontmatter deterministisch:
|
Berechnet einen Hex-Hash gemäß ``mode``.
|
||||||
- JSON mit sortierten Keys
|
|
||||||
- kompakte Separatoren
|
mode:
|
||||||
- UTF-8, keine ASCII-Escapes
|
- "body" (Default)
|
||||||
Achtung: Datumswerte müssen Strings sein (siehe Schema).
|
- "frontmatter"
|
||||||
|
- "body+frontmatter"
|
||||||
"""
|
"""
|
||||||
return json.dumps(
|
if mode is None:
|
||||||
fm or {},
|
mode = os.environ.get("MINDNET_HASH_MODE", "body").strip().lower()
|
||||||
ensure_ascii=False,
|
body = body or ""
|
||||||
sort_keys=True,
|
fm_s = _canon_frontmatter(frontmatter or {})
|
||||||
separators=(",", ":"),
|
|
||||||
)
|
h = hashlib.sha256()
|
||||||
|
if mode == "frontmatter":
|
||||||
|
h.update(fm_s.encode("utf-8"))
|
||||||
|
elif mode == "body+frontmatter":
|
||||||
|
h.update(body.encode("utf-8"))
|
||||||
|
h.update(b"\n--FM--\n")
|
||||||
|
h.update(fm_s.encode("utf-8"))
|
||||||
|
else: # body
|
||||||
|
h.update(body.encode("utf-8"))
|
||||||
|
return h.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
def get_hash_mode_from_env() -> str:
    """
    Return the hash strategy configured through ``MINDNET_HASH_MODE``.

    The raw environment value is trimmed and lower-cased. Only ``body``,
    ``body+frontmatter`` and ``frontmatter`` are accepted; an unset, empty
    or unrecognised value yields the default ``body``.
    """
    raw = os.environ.get("MINDNET_HASH_MODE")
    normalized = raw.strip().lower() if raw else ""
    allowed = {"body", "body+frontmatter", "frontmatter"}
    return normalized if normalized in allowed else "body"


# ---------------------------------------------------------------------------
# Core function
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def compute_hash(body: str, frontmatter: Dict[str, Any], mode: Optional[str] = None) -> str:
    """
    Compute the note hash according to the selected strategy.

    Strategies:
      - ``body``: hash of the stripped body only (also used for any
        unrecognised strategy name)
      - ``body+frontmatter``: hash of the stripped body, a ``---`` separator
        and the canonical frontmatter serialisation
      - ``frontmatter``: hash of the canonical frontmatter serialisation only

    When ``mode`` is empty or ``None`` the strategy is taken from
    ``get_hash_mode_from_env()``.
    """
    selected = mode if mode else get_hash_mode_from_env()
    selected = selected.lower()

    text = (body or "").strip()
    # Serialised unconditionally, so a non-serialisable frontmatter fails in
    # every mode, not only in the frontmatter-based ones.
    canonical_fm = canonicalize_frontmatter(frontmatter or {})

    if selected == "body+frontmatter":
        return sha256_text("\n\n---\n\n".join((text, canonical_fm)))
    if selected == "frontmatter":
        return sha256_text(canonical_fm)
    return sha256_text(text)
|
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------#
|
|
||||||
# Helfer: parsed -> (frontmatter, body, path)
|
|
||||||
# -----------------------------------------------------------------------------#
|
|
||||||
|
|
||||||
def _frontmatter_as_dict(fm_val: Any) -> Dict[str, Any]:
    """Coerce a frontmatter value into a fresh plain ``dict``.

    Falsy values (``None``, empty mapping) become ``{}``. Anything ``dict()``
    accepts is copied. Objects that ``dict()`` rejects but that expose a
    callable ``dict()`` method returning a dict are converted through that
    method. Otherwise the original ``TypeError``/``ValueError`` from
    ``dict()`` propagates.
    """
    if not fm_val:
        return {}
    try:
        return dict(fm_val)
    except (TypeError, ValueError):
        to_dict = getattr(fm_val, "dict", None)
        if callable(to_dict):
            # Call the bound method without arguments; it must not receive
            # the object itself as a positional parameter.
            converted = to_dict()
            if isinstance(converted, dict):
                return dict(converted)
        raise


def _extract_parsed(parsed: Any) -> tuple[Dict[str, Any], str, Optional[str]]:
    """
    Normalise a parser result into ``(frontmatter, body, path)``.

    Accepts either a dict with the keys ``frontmatter``, ``body`` and
    optionally ``path``, or an object exposing attributes of the same names
    (e.g. a ParsedNote).

    Returns
    -------
    tuple
        ``frontmatter`` is always a new ``dict`` (never ``None``), ``body``
        is ``""`` when missing or falsy, and ``path`` is passed through
        unchanged (``None`` when absent).

    Raises
    ------
    TypeError, ValueError
        If a non-empty frontmatter value cannot be converted to a dict.
    """
    if isinstance(parsed, dict):
        fm_raw = parsed.get("frontmatter")
        body = parsed.get("body")
        path = parsed.get("path")
    else:
        fm_raw = getattr(parsed, "frontmatter", None)
        body = getattr(parsed, "body", None)
        path = getattr(parsed, "path", None)

    return _frontmatter_as_dict(fm_raw), body or "", path
|
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------#
|
|
||||||
# Hauptfunktion für Note-Payload
|
|
||||||
# -----------------------------------------------------------------------------#
|
|
||||||
|
|
||||||
def make_note_payload(parsed: Any, vault_root: Optional[str] = None) -> Dict[str, Any]:
    """
    Build the Qdrant payload for a parsed note.

    Parameters
    ----------
    parsed : Any
        Either a dict or an object. The frontmatter is read from the first
        non-empty of ``frontmatter``, ``fm`` or ``front_matter``; the note
        text from ``body``; the file path from ``path``. Missing or empty
        fields fall back to ``{}``, ``""`` and ``""`` respectively.
    vault_root : Optional[str]
        Vault root used to make ``path`` relative. When ``None`` (or when
        the path cannot be relativised) the path is stored unchanged.

    Returns
    -------
    Dict[str, Any]
        Payload with ``note_id``, ``title``, ``type``, ``status``,
        ``created``, ``updated``, ``path``, ``tags``, ``hash_fulltext``,
        the complete body under ``fulltext``, the de-duplicated note-level
        wikilinks under ``references``, plus any of the optional
        frontmatter fields ``area``, ``project``, ``source``, ``lang`` and
        ``slug`` that are present.
    """
    is_mapping = isinstance(parsed, dict)

    def _lookup(*names: str) -> Any:
        # First truthy value stored under any of ``names``. Keys are used for
        # dict input and attributes for objects, so an empty attribute on an
        # object never triggers a ``.get`` call the object does not have.
        for name in names:
            value = parsed.get(name) if is_mapping else getattr(parsed, name, None)
            if value:
                return value
        return None

    fm = _lookup("frontmatter", "fm", "front_matter") or {}
    if not isinstance(fm, dict):
        fm = dict(fm)
    body = _lookup("body") or ""
    path = _lookup("path") or ""

    # mode=None lets compute_hash pick the strategy from MINDNET_HASH_MODE.
    hash_fulltext = compute_hash(body=body, frontmatter=fm, mode=None)

    rel_path = path
    if vault_root and path:
        try:
            rel_path = os.path.relpath(path, vault_root).replace("\\", "/").lstrip("/")
        except (TypeError, ValueError):
            # Fail-safe: the path does not influence hash or id, so keep the
            # original value when it cannot be made relative.
            rel_path = path

    # Note-level wikilinks in first-seen order without duplicates; serves as a
    # fallback when no chunks are delivered.
    note_level_refs = list(dict.fromkeys(extract_wikilinks(body))) if body else []

    payload: Dict[str, Any] = {
        "note_id": fm.get("id") or fm.get("note_id"),
        "title": fm.get("title"),
        "type": fm.get("type"),
        "status": fm.get("status"),
        "created": fm.get("created"),
        "updated": fm.get("updated"),
        "path": rel_path or fm.get("path"),
        "tags": fm.get("tags"),
        "hash_fulltext": hash_fulltext,
        # Persist the full body so exports can rebuild the note losslessly.
        "fulltext": body,
        "references": note_level_refs,
    }

    # Pass known optional frontmatter fields through unchanged.
    payload.update(
        {key: fm[key] for key in ("area", "project", "source", "lang", "slug") if key in fm}
    )

    return payload
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------#
|
# ---------------------------------------------------------------------------
|
||||||
# Optional: Self-Test
|
# CLI – Sichtprüfung
|
||||||
# -----------------------------------------------------------------------------#
|
# ---------------------------------------------------------------------------
|
||||||
if __name__ == "__main__":
|
|
||||||
class _PN:
|
|
||||||
def __init__(self):
|
|
||||||
self.frontmatter = {
|
|
||||||
"id": "demo-123",
|
|
||||||
"title": "Demo",
|
|
||||||
"type": "note",
|
|
||||||
"status": "active",
|
|
||||||
"created": "2025-09-08T10:00:00+00:00",
|
|
||||||
"updated": "2025-09-08T10:00:00+00:00",
|
|
||||||
"tags": ["demo", "test"],
|
|
||||||
}
|
|
||||||
body = "# Überschrift\n\nText."
|
|
||||||
path = "demo.md"
|
|
||||||
|
|
||||||
parsed_dict = {
|
def _cli() -> None:
|
||||||
"frontmatter": {
|
ap = argparse.ArgumentParser(description="Note-Payload aus Markdown erzeugen und anzeigen")
|
||||||
"id": "demo-456",
|
ap.add_argument("--from-file", dest="src", required=True, help="Pfad zur Markdown-Datei")
|
||||||
"title": "Demo2",
|
ap.add_argument("--vault-root", dest="vault_root", default=None, help="Vault-Wurzel zur Pfad-Relativierung")
|
||||||
"type": "note",
|
ap.add_argument("--print", dest="do_print", action="store_true", help="Payload auf stdout ausgeben")
|
||||||
"status": "active",
|
args = ap.parse_args()
|
||||||
"created": "2025-09-08T10:00:00+00:00",
|
|
||||||
"updated": "2025-09-08T10:00:00+00:00",
|
|
||||||
},
|
|
||||||
"body": "Text2",
|
|
||||||
"path": "demo2.md",
|
|
||||||
}
|
|
||||||
|
|
||||||
for mode in ("body", "body+frontmatter", "frontmatter"):
|
parsed = parse_markdown(args.src)
|
||||||
os.environ["MINDNET_HASH_MODE"] = mode
|
payload = make_note_payload(parsed, vault_root=args.vault_root)
|
||||||
print(f"\n-- MODE={mode}")
|
|
||||||
print(make_note_payload(_PN()))
|
if args.do_print:
|
||||||
print(make_note_payload(parsed_dict))
|
print(json.dumps(payload, ensure_ascii=False, indent=2))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__": # pragma: no cover
|
||||||
|
_cli()
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user