scriptAudit #11
|
|
@ -1,7 +1,7 @@
|
|||
"""
|
||||
FILE: app/core/ingestion.py
|
||||
DESCRIPTION: Haupt-Ingestion-Logik. Liest Markdown, prüft Hashes (Change Detection), zerlegt in Chunks und schreibt in Qdrant.
|
||||
VERSION: 2.5.3 (Fix: Hash-Mode Full for Metadata Detection)
|
||||
DESCRIPTION: Haupt-Ingestion-Logik. Liest Markdown, steuert Change-Detection (via ENV) und schreibt in Qdrant.
|
||||
VERSION: 2.6.0 (Feat: Flexible Change Detection & Full Logic Restoration)
|
||||
STATUS: Active
|
||||
DEPENDENCIES: app.core.parser, app.core.note_payload, app.core.chunker, app.core.derive_edges, app.core.qdrant*, app.services.embeddings_client
|
||||
EXTERNAL_CONFIG: config/types.yaml
|
||||
|
|
@ -56,9 +56,11 @@ def resolve_note_type(requested: Optional[str], reg: dict) -> str:
|
|||
|
||||
def effective_chunk_profile(note_type: str, reg: dict) -> str:
|
||||
t_cfg = reg.get("types", {}).get(note_type, {})
|
||||
if t_cfg and t_cfg.get("chunk_profile"):
|
||||
return t_cfg.get("chunk_profile")
|
||||
return reg.get("defaults", {}).get("chunk_profile", "default")
|
||||
# FIX: Konsistenz mit note_payload.py - Prüfe erst den korrekten Key
|
||||
if t_cfg:
|
||||
if t_cfg.get("chunking_profile"): return t_cfg.get("chunking_profile")
|
||||
if t_cfg.get("chunk_profile"): return t_cfg.get("chunk_profile") # Legacy
|
||||
return reg.get("defaults", {}).get("chunking_profile", "sliding_standard")
|
||||
|
||||
def effective_retriever_weight(note_type: str, reg: dict) -> float:
|
||||
t_cfg = reg.get("types", {}).get(note_type, {})
|
||||
|
|
@ -79,6 +81,9 @@ class IngestionService:
|
|||
self.registry = load_type_registry()
|
||||
self.embedder = EmbeddingsClient()
|
||||
|
||||
# ACTIVE HASH MODE aus ENV lesen (Default: full)
|
||||
self.active_hash_mode = os.getenv("MINDNET_CHANGE_DETECTION_MODE", "full")
|
||||
|
||||
try:
|
||||
ensure_collections(self.client, self.prefix, self.dim)
|
||||
ensure_payload_indexes(self.client, self.prefix)
|
||||
|
|
@ -93,8 +98,8 @@ class IngestionService:
|
|||
apply: bool = False,
|
||||
purge_before: bool = False,
|
||||
note_scope_refs: bool = False,
|
||||
# FIX: Default auf "full", damit Metadata-Änderungen erkannt werden
|
||||
hash_mode: str = "full",
|
||||
# Hash-Mode wird nicht mehr übergeben, sondern via ENV gesteuert.
|
||||
# Source und Normalize bleiben konfigurierbar.
|
||||
hash_source: str = "parsed",
|
||||
hash_normalize: str = "canonical"
|
||||
) -> Dict[str, Any]:
|
||||
|
|
@ -132,11 +137,11 @@ class IngestionService:
|
|||
fm["retriever_weight"] = float(weight)
|
||||
|
||||
# 3. Build Note Payload
|
||||
# Ruft make_note_payload auf, welches JETZT alle Hash-Varianten berechnet.
|
||||
try:
|
||||
note_pl = make_note_payload(
|
||||
parsed,
|
||||
vault_root=vault_root,
|
||||
hash_mode=hash_mode,
|
||||
hash_normalize=hash_normalize,
|
||||
hash_source=hash_source,
|
||||
file_path=file_path
|
||||
|
|
@ -150,25 +155,31 @@ class IngestionService:
|
|||
logger.error(f"Payload build failed: {e}")
|
||||
return {**result, "error": f"Payload build failed: {str(e)}"}
|
||||
|
||||
# 4. Change Detection (Updated Logic)
|
||||
# 4. Change Detection (Updated Logic with ENV Strategy)
|
||||
old_payload = None
|
||||
if not force_replace:
|
||||
old_payload = self._fetch_note_payload(note_id)
|
||||
|
||||
has_old = old_payload is not None
|
||||
key_current = f"{hash_mode}:{hash_source}:{hash_normalize}"
|
||||
|
||||
# Robustere Abfrage: Falls 'hashes' im Payload fehlt, None zurückgeben
|
||||
# Wir bauen den Key basierend auf der ENV-Einstellung
|
||||
check_key = f"{self.active_hash_mode}:{hash_source}:{hash_normalize}"
|
||||
|
||||
old_hashes = (old_payload or {}).get("hashes")
|
||||
|
||||
# Fallback Logik für alte Daten
|
||||
if isinstance(old_hashes, dict):
|
||||
old_hash = old_hashes.get(key_current)
|
||||
old_hash = old_hashes.get(check_key)
|
||||
elif isinstance(old_hashes, str):
|
||||
# Sehr alte Legacy Daten hatten Hash direkt als String (meist Body)
|
||||
# Wenn wir im Body-Modus sind, ist das okay, sonst Force Update
|
||||
old_hash = old_hashes if self.active_hash_mode == "body" else None
|
||||
else:
|
||||
# Fallback für Legacy Payloads ohne Hash-Dict
|
||||
old_hash = None
|
||||
|
||||
new_hash = note_pl.get("hashes", {}).get(key_current)
|
||||
new_hash = note_pl.get("hashes", {}).get(check_key)
|
||||
|
||||
# Wenn wir keinen alten Hash haben (z.B. neues Hash-Schema "full"), erzwingen wir Update
|
||||
# Vergleich
|
||||
hash_changed = (old_hash != new_hash)
|
||||
|
||||
chunks_missing, edges_missing = self._artifacts_missing(note_id)
|
||||
|
|
|
|||
|
|
@ -1,7 +1,10 @@
|
|||
"""
|
||||
FILE: app/core/note_payload.py
|
||||
DESCRIPTION: Baut das JSON-Objekt für 'mindnet_notes'. Inkludiert Hash-Berechnung für Change Detection.
|
||||
VERSION: 2.2.0 (Fix: Missing Hash Calculation)
|
||||
DESCRIPTION: Baut das JSON-Objekt.
|
||||
FEATURES:
|
||||
1. Multi-Hash: Berechnet immer sowohl den 'body'- als auch den 'full'-Hash für flexible Change Detection.
|
||||
2. Config-Fix: Liest korrekt 'chunking_profile' aus types.yaml (statt Legacy 'chunk_profile').
|
||||
VERSION: 2.3.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: yaml, os, json, pathlib, hashlib
|
||||
EXTERNAL_CONFIG: config/types.yaml
|
||||
|
|
@ -77,7 +80,7 @@ def _ensure_list(x) -> list:
|
|||
return [str(i) for i in x]
|
||||
return [str(x)]
|
||||
|
||||
# --- NEW: Hash Logic ---
|
||||
# --- Hash Logic ---
|
||||
def _compute_hash(content: str) -> str:
|
||||
"""Berechnet einen SHA-256 Hash für den gegebenen String."""
|
||||
if not content:
|
||||
|
|
@ -87,8 +90,6 @@ def _compute_hash(content: str) -> str:
|
|||
def _get_hash_source_content(n: Dict[str, Any], mode: str) -> str:
|
||||
"""
|
||||
Stellt den String zusammen, der gehasht werden soll.
|
||||
mode="body": Nur der Textinhalt.
|
||||
mode="full": Textinhalt + relevante Metadaten (Titel, Typ, Tags, Status).
|
||||
"""
|
||||
body = str(n.get("body") or "")
|
||||
|
||||
|
|
@ -96,13 +97,12 @@ def _get_hash_source_content(n: Dict[str, Any], mode: str) -> str:
|
|||
return body
|
||||
|
||||
if mode == "full":
|
||||
# Wir nehmen die wichtigsten strukturellen Metadaten dazu.
|
||||
# Wichtig: Sortierte Keys für deterministisches Verhalten!
|
||||
fm = n.get("frontmatter") or {}
|
||||
|
||||
# Extrahieren relevanter Meta-Felder, die Änderungen auslösen sollen
|
||||
# Wichtig: Sortierte Keys für deterministisches Verhalten!
|
||||
# Wir nehmen alle steuernden Metadaten auf
|
||||
meta_parts = []
|
||||
for k in sorted(["title", "type", "status", "tags", "chunk_profile"]):
|
||||
# Hier checken wir keys, die eine Neu-Indizierung rechtfertigen würden
|
||||
for k in sorted(["title", "type", "status", "tags", "chunking_profile", "chunk_profile", "retriever_weight"]):
|
||||
val = fm.get(k)
|
||||
if val is not None:
|
||||
meta_parts.append(f"{k}:{val}")
|
||||
|
|
@ -157,14 +157,13 @@ def _cfg_defaults(reg: dict) -> dict:
|
|||
def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
|
||||
"""
|
||||
Baut das Note-Payload für mindnet_notes auf.
|
||||
Inkludiert Hash-Berechnung.
|
||||
Inkludiert Hash-Berechnung (Body & Full) und korrigierte Config-Lookups.
|
||||
"""
|
||||
n = _as_dict(note)
|
||||
path_arg, types_cfg_explicit = _pick_args(*args, **kwargs)
|
||||
reg = _load_types_config(types_cfg_explicit)
|
||||
|
||||
# Hash Config extrahieren (Defaults wie in ingestion.py)
|
||||
hash_mode = kwargs.get("hash_mode", "full")
|
||||
# Hash Config (Parameter für Source/Normalize, Mode ist hardcoded auf 'beide')
|
||||
hash_source = kwargs.get("hash_source", "parsed")
|
||||
hash_normalize = kwargs.get("hash_normalize", "canonical")
|
||||
|
||||
|
|
@ -188,18 +187,21 @@ def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
|
|||
except Exception:
|
||||
retriever_weight = default_rw
|
||||
|
||||
# --- chunk_profile ---
|
||||
chunk_profile = fm.get("chunk_profile")
|
||||
# --- chunk_profile (FIXED LOGIC) ---
|
||||
# 1. Frontmatter Override (beide Schreibweisen erlaubt)
|
||||
chunk_profile = fm.get("chunking_profile") or fm.get("chunk_profile")
|
||||
|
||||
# 2. Type Config (Korrekter Key 'chunking_profile' aus types.yaml)
|
||||
if chunk_profile is None:
|
||||
chunk_profile = cfg_type.get(
|
||||
"chunk_profile",
|
||||
cfg_def.get(
|
||||
"chunk_profile",
|
||||
os.environ.get("MINDNET_DEFAULT_CHUNK_PROFILE", "medium"),
|
||||
),
|
||||
)
|
||||
if not isinstance(chunk_profile, str):
|
||||
chunk_profile = "medium"
|
||||
chunk_profile = cfg_type.get("chunking_profile")
|
||||
|
||||
# 3. Default Config (Fallback auf sliding_standard statt medium)
|
||||
if chunk_profile is None:
|
||||
chunk_profile = cfg_def.get("chunking_profile", "sliding_standard")
|
||||
|
||||
# 4. Safety Fallback
|
||||
if not isinstance(chunk_profile, str) or not chunk_profile:
|
||||
chunk_profile = "sliding_standard"
|
||||
|
||||
# --- edge_defaults ---
|
||||
edge_defaults = fm.get("edge_defaults")
|
||||
|
|
@ -225,20 +227,20 @@ def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
|
|||
"retriever_weight": retriever_weight,
|
||||
"chunk_profile": chunk_profile,
|
||||
"edge_defaults": edge_defaults,
|
||||
"hashes": {} # Init Hash Dict
|
||||
}
|
||||
|
||||
# --- FIX: Hash Calculation ---
|
||||
# Wir berechnen den Hash basierend auf dem Modus
|
||||
content_to_hash = _get_hash_source_content(n, hash_mode)
|
||||
computed_hash = _compute_hash(content_to_hash)
|
||||
# --- MULTI-HASH CALCULATION (Strategy Decoupling) ---
|
||||
# Wir berechnen immer BEIDE Strategien und speichern sie.
|
||||
# ingestion.py entscheidet dann anhand der ENV-Variable, welcher verglichen wird.
|
||||
modes_to_calc = ["body", "full"]
|
||||
|
||||
# Der Key muss exakt dem entsprechen, was ingestion.py erwartet:
|
||||
# key_current = f"{hash_mode}:{hash_source}:{hash_normalize}"
|
||||
hash_key = f"{hash_mode}:{hash_source}:{hash_normalize}"
|
||||
|
||||
payload["hashes"] = {
|
||||
hash_key: computed_hash
|
||||
}
|
||||
for mode in modes_to_calc:
|
||||
content_to_hash = _get_hash_source_content(n, mode)
|
||||
computed_hash = _compute_hash(content_to_hash)
|
||||
# Key Schema: mode:source:normalize (z.B. "full:parsed:canonical")
|
||||
key = f"{mode}:{hash_source}:{hash_normalize}"
|
||||
payload["hashes"][key] = computed_hash
|
||||
|
||||
# Tags / Keywords
|
||||
tags = fm.get("tags") or fm.get("keywords") or n.get("tags")
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user