"""
FILE: app/core/ingestion/ingestion_note_payload.py

DESCRIPTION: Builds the JSON payload object for mindnet_notes.

FEATURES: Multi-hash (body/full), config fix for chunking_profile.

VERSION: 2.4.0
"""
|
|
from __future__ import annotations
|
|
from typing import Any, Dict, Tuple, Optional
|
|
import os
|
|
import json
|
|
import pathlib
|
|
import hashlib
|
|
import yaml
|
|
|
|
def _as_dict(x) -> Dict[str, Any]:
|
|
if isinstance(x, dict): return dict(x)
|
|
out: Dict[str, Any] = {}
|
|
for attr in ("frontmatter", "body", "id", "note_id", "title", "path", "tags", "type", "created", "modified", "date"):
|
|
if hasattr(x, attr):
|
|
val = getattr(x, attr)
|
|
if val is not None: out[attr] = val
|
|
if not out: out["raw"] = str(x)
|
|
return out
|
|
|
|
def _ensure_list(x) -> list:
|
|
if x is None: return []
|
|
if isinstance(x, list): return [str(i) for i in x]
|
|
if isinstance(x, (set, tuple)): return [str(i) for i in x]
|
|
return [str(x)]
|
|
|
|
def _compute_hash(content: str) -> str:
|
|
if not content: return ""
|
|
return hashlib.sha256(content.encode("utf-8")).hexdigest()
|
|
|
|
def _get_hash_source_content(n: Dict[str, Any], mode: str) -> str:
|
|
body = str(n.get("body") or "")
|
|
if mode == "body": return body
|
|
if mode == "full":
|
|
fm = n.get("frontmatter") or {}
|
|
meta_parts = []
|
|
for k in sorted(["title", "type", "status", "tags", "chunking_profile", "chunk_profile", "retriever_weight"]):
|
|
val = fm.get(k)
|
|
if val is not None: meta_parts.append(f"{k}:{val}")
|
|
return f" {'|'.join(meta_parts)}||{body}"
|
|
return body
|
|
|
|
def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
    """Build the note payload for mindnet_notes, including multi-hashes.

    Args:
        note: Note object or dict; coerced into a dict via ``_as_dict``.
        *args: Accepted for backward compatibility; unused.
        **kwargs: Optional settings:
            hash_source: label embedded in the hash keys (default "parsed").
            hash_normalize: label embedded in the hash keys (default "canonical").
            path: fallback note path if the note itself has none.
            types_cfg: accepted for backward compatibility; unused here.

    Returns:
        Dict with note_id, title, type, path, retriever_weight,
        chunk_profile and a "hashes" map keyed
        ``"<mode>:<hash_source>:<hash_normalize>"``; tags, aliases,
        created/modified/date and fulltext are added when present.
    """
    n = _as_dict(note)
    hash_source = kwargs.get("hash_source", "parsed")
    hash_normalize = kwargs.get("hash_normalize", "canonical")

    fm = n.get("frontmatter") or {}
    note_type = str(fm.get("type") or n.get("type") or "concept")

    # Weights & Profiles. Frontmatter comes from user-edited YAML, so a
    # non-numeric or null retriever_weight must not crash ingestion:
    # fall back to the default weight 1.0 instead of raising.
    try:
        retriever_weight = float(fm.get("retriever_weight", 1.0))
    except (TypeError, ValueError):
        retriever_weight = 1.0
    chunk_profile = fm.get("chunking_profile") or fm.get("chunk_profile") or "sliding_standard"

    payload: Dict[str, Any] = {
        "note_id": n.get("note_id") or n.get("id") or fm.get("id"),
        "title": n.get("title") or fm.get("title") or "",
        "type": note_type,
        "path": str(n.get("path") or kwargs.get("path") or ""),
        "retriever_weight": retriever_weight,
        "chunk_profile": chunk_profile,
        "hashes": {},
    }

    # One hash per mode; the key encodes mode plus source/normalization labels.
    for mode in ("body", "full"):
        key = f"{mode}:{hash_source}:{hash_normalize}"
        payload["hashes"][key] = _compute_hash(_get_hash_source_content(n, mode))

    tags = fm.get("tags") or n.get("tags")
    if tags:
        payload["tags"] = _ensure_list(tags)
    if fm.get("aliases"):
        payload["aliases"] = _ensure_list(fm.get("aliases"))
    for k in ("created", "modified", "date"):
        v = fm.get(k) or n.get(k)
        if v:
            payload[k] = str(v)
    if n.get("body"):
        payload["fulltext"] = str(n["body"])

    return payload