scriptAudit #11

Merged
Lars merged 24 commits from scriptAudit into main 2025-12-16 18:55:45 +01:00
2 changed files with 63 additions and 50 deletions
Showing only changes of commit a272c39613 - Show all commits

View File

@@ -1,7 +1,7 @@
"""
FILE: app/core/ingestion.py
DESCRIPTION: Haupt-Ingestion-Logik. Liest Markdown, prüft Hashes (Change Detection), zerlegt in Chunks und schreibt in Qdrant.
VERSION: 2.5.3 (Fix: Hash-Mode Full for Metadata Detection)
DESCRIPTION: Haupt-Ingestion-Logik. Liest Markdown, steuert Change-Detection (via ENV) und schreibt in Qdrant.
VERSION: 2.6.0 (Feat: Flexible Change Detection & Full Logic Restoration)
STATUS: Active
DEPENDENCIES: app.core.parser, app.core.note_payload, app.core.chunker, app.core.derive_edges, app.core.qdrant*, app.services.embeddings_client
EXTERNAL_CONFIG: config/types.yaml
@@ -56,9 +56,11 @@ def resolve_note_type(requested: Optional[str], reg: dict) -> str:
def effective_chunk_profile(note_type: str, reg: dict) -> str:
t_cfg = reg.get("types", {}).get(note_type, {})
if t_cfg and t_cfg.get("chunk_profile"):
return t_cfg.get("chunk_profile")
return reg.get("defaults", {}).get("chunk_profile", "default")
# FIX: Konsistenz mit note_payload.py - Prüfe erst den korrekten Key
if t_cfg:
if t_cfg.get("chunking_profile"): return t_cfg.get("chunking_profile")
if t_cfg.get("chunk_profile"): return t_cfg.get("chunk_profile") # Legacy
return reg.get("defaults", {}).get("chunking_profile", "sliding_standard")
def effective_retriever_weight(note_type: str, reg: dict) -> float:
t_cfg = reg.get("types", {}).get(note_type, {})
@@ -79,6 +81,9 @@ class IngestionService:
self.registry = load_type_registry()
self.embedder = EmbeddingsClient()
# ACTIVE HASH MODE aus ENV lesen (Default: full)
self.active_hash_mode = os.getenv("MINDNET_CHANGE_DETECTION_MODE", "full")
try:
ensure_collections(self.client, self.prefix, self.dim)
ensure_payload_indexes(self.client, self.prefix)
@@ -93,8 +98,8 @@ class IngestionService:
apply: bool = False,
purge_before: bool = False,
note_scope_refs: bool = False,
# FIX: Default auf "full", damit Metadata-Änderungen erkannt werden
hash_mode: str = "full",
# Hash-Mode wird nicht mehr übergeben, sondern via ENV gesteuert.
# Source und Normalize bleiben konfigurierbar.
hash_source: str = "parsed",
hash_normalize: str = "canonical"
) -> Dict[str, Any]:
@@ -132,11 +137,11 @@ class IngestionService:
fm["retriever_weight"] = float(weight)
# 3. Build Note Payload
# Ruft make_note_payload auf, welches JETZT alle Hash-Varianten berechnet.
try:
note_pl = make_note_payload(
parsed,
vault_root=vault_root,
hash_mode=hash_mode,
hash_normalize=hash_normalize,
hash_source=hash_source,
file_path=file_path
@@ -150,25 +155,31 @@ class IngestionService:
logger.error(f"Payload build failed: {e}")
return {**result, "error": f"Payload build failed: {str(e)}"}
# 4. Change Detection (Updated Logic)
# 4. Change Detection (Updated Logic with ENV Strategy)
old_payload = None
if not force_replace:
old_payload = self._fetch_note_payload(note_id)
has_old = old_payload is not None
key_current = f"{hash_mode}:{hash_source}:{hash_normalize}"
# Robustere Abfrage: Falls 'hashes' im Payload fehlt, None zurückgeben
# Wir bauen den Key basierend auf der ENV-Einstellung
check_key = f"{self.active_hash_mode}:{hash_source}:{hash_normalize}"
old_hashes = (old_payload or {}).get("hashes")
# Fallback Logik für alte Daten
if isinstance(old_hashes, dict):
old_hash = old_hashes.get(key_current)
old_hash = old_hashes.get(check_key)
elif isinstance(old_hashes, str):
# Sehr alte Legacy Daten hatten Hash direkt als String (meist Body)
# Wenn wir im Body-Modus sind, ist das okay, sonst Force Update
old_hash = old_hashes if self.active_hash_mode == "body" else None
else:
# Fallback für Legacy Payloads ohne Hash-Dict
old_hash = None
new_hash = note_pl.get("hashes", {}).get(key_current)
new_hash = note_pl.get("hashes", {}).get(check_key)
# Wenn wir keinen alten Hash haben (z.B. neues Hash-Schema "full"), erzwingen wir Update
# Vergleich
hash_changed = (old_hash != new_hash)
chunks_missing, edges_missing = self._artifacts_missing(note_id)

View File

@@ -1,7 +1,10 @@
"""
FILE: app/core/note_payload.py
DESCRIPTION: Baut das JSON-Objekt für 'mindnet_notes'. Inkludiert Hash-Berechnung für Change Detection.
VERSION: 2.2.0 (Fix: Missing Hash Calculation)
DESCRIPTION: Baut das JSON-Objekt.
FEATURES:
1. Multi-Hash: Berechnet immer 'body' AND 'full' Hashes für flexible Change Detection.
2. Config-Fix: Liest korrekt 'chunking_profile' aus types.yaml (statt Legacy 'chunk_profile').
VERSION: 2.3.0
STATUS: Active
DEPENDENCIES: yaml, os, json, pathlib, hashlib
EXTERNAL_CONFIG: config/types.yaml
@@ -77,7 +80,7 @@ def _ensure_list(x) -> list:
return [str(i) for i in x]
return [str(x)]
# --- NEW: Hash Logic ---
# --- Hash Logic ---
def _compute_hash(content: str) -> str:
"""Berechnet einen SHA-256 Hash für den gegebenen String."""
if not content:
@@ -87,8 +90,6 @@ def _compute_hash(content: str) -> str:
def _get_hash_source_content(n: Dict[str, Any], mode: str) -> str:
"""
Stellt den String zusammen, der gehasht werden soll.
mode="body": Nur der Textinhalt.
mode="full": Textinhalt + relevante Metadaten (Titel, Typ, Tags, Status).
"""
body = str(n.get("body") or "")
@@ -96,13 +97,12 @@ def _get_hash_source_content(n: Dict[str, Any], mode: str) -> str:
return body
if mode == "full":
# Wir nehmen die wichtigsten strukturellen Metadaten dazu.
# Wichtig: Sortierte Keys für deterministisches Verhalten!
fm = n.get("frontmatter") or {}
# Extrahieren relevanter Meta-Felder, die Änderungen auslösen sollen
# Wichtig: Sortierte Keys für deterministisches Verhalten!
# Wir nehmen alle steuernden Metadaten auf
meta_parts = []
for k in sorted(["title", "type", "status", "tags", "chunk_profile"]):
# Hier checken wir keys, die eine Neu-Indizierung rechtfertigen würden
for k in sorted(["title", "type", "status", "tags", "chunking_profile", "chunk_profile", "retriever_weight"]):
val = fm.get(k)
if val is not None:
meta_parts.append(f"{k}:{val}")
@@ -157,14 +157,13 @@ def _cfg_defaults(reg: dict) -> dict:
def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
"""
Baut das Note-Payload für mindnet_notes auf.
Inkludiert Hash-Berechnung.
Inkludiert Hash-Berechnung (Body & Full) und korrigierte Config-Lookups.
"""
n = _as_dict(note)
path_arg, types_cfg_explicit = _pick_args(*args, **kwargs)
reg = _load_types_config(types_cfg_explicit)
# Hash Config extrahieren (Defaults wie in ingestion.py)
hash_mode = kwargs.get("hash_mode", "full")
# Hash Config (Parameter für Source/Normalize, Mode ist hardcoded auf 'beide')
hash_source = kwargs.get("hash_source", "parsed")
hash_normalize = kwargs.get("hash_normalize", "canonical")
@@ -188,18 +187,21 @@ def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
except Exception:
retriever_weight = default_rw
# --- chunk_profile ---
chunk_profile = fm.get("chunk_profile")
# --- chunk_profile (FIXED LOGIC) ---
# 1. Frontmatter Override (beide Schreibweisen erlaubt)
chunk_profile = fm.get("chunking_profile") or fm.get("chunk_profile")
# 2. Type Config (Korrekter Key 'chunking_profile' aus types.yaml)
if chunk_profile is None:
chunk_profile = cfg_type.get(
"chunk_profile",
cfg_def.get(
"chunk_profile",
os.environ.get("MINDNET_DEFAULT_CHUNK_PROFILE", "medium"),
),
)
if not isinstance(chunk_profile, str):
chunk_profile = "medium"
chunk_profile = cfg_type.get("chunking_profile")
# 3. Default Config (Fallback auf sliding_standard statt medium)
if chunk_profile is None:
chunk_profile = cfg_def.get("chunking_profile", "sliding_standard")
# 4. Safety Fallback
if not isinstance(chunk_profile, str) or not chunk_profile:
chunk_profile = "sliding_standard"
# --- edge_defaults ---
edge_defaults = fm.get("edge_defaults")
@@ -225,20 +227,20 @@ def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
"retriever_weight": retriever_weight,
"chunk_profile": chunk_profile,
"edge_defaults": edge_defaults,
"hashes": {} # Init Hash Dict
}
# --- FIX: Hash Calculation ---
# Wir berechnen den Hash basierend auf dem Modus
content_to_hash = _get_hash_source_content(n, hash_mode)
computed_hash = _compute_hash(content_to_hash)
# --- MULTI-HASH CALCULATION (Strategy Decoupling) ---
# Wir berechnen immer BEIDE Strategien und speichern sie.
# ingestion.py entscheidet dann anhand der ENV-Variable, welcher verglichen wird.
modes_to_calc = ["body", "full"]
# Der Key muss exakt dem entsprechen, was ingestion.py erwartet:
# key_current = f"{hash_mode}:{hash_source}:{hash_normalize}"
hash_key = f"{hash_mode}:{hash_source}:{hash_normalize}"
payload["hashes"] = {
hash_key: computed_hash
}
for mode in modes_to_calc:
content_to_hash = _get_hash_source_content(n, mode)
computed_hash = _compute_hash(content_to_hash)
# Key Schema: mode:source:normalize (z.B. "full:parsed:canonical")
key = f"{mode}:{hash_source}:{hash_normalize}"
payload["hashes"][key] = computed_hash
# Tags / Keywords
tags = fm.get("tags") or fm.get("keywords") or n.get("tags")