mindnet/app/core/ingestion/ingestion_validation.py

67 lines
2.5 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
FILE: app/core/ingestion/ingestion_validation.py
DESCRIPTION: WP-15b semantische Validierung von Kanten gegen den LocalBatchCache.
AUDIT v2.12.3: Integration der zentralen Text-Bereinigung (WP-14).
"""
import logging
from typing import Dict, Any
from app.core.parser import NoteContext
# ENTSCHEIDENDER FIX: Import der neutralen Bereinigungs-Logik zur Vermeidung von Circular Imports
from app.core.registry import clean_llm_text
logger = logging.getLogger(__name__)
async def validate_edge_candidate(
chunk_text: str,
edge: Dict,
batch_cache: Dict[str, NoteContext],
llm_service: Any,
provider: str
) -> bool:
"""
WP-15b: Validiert einen Kandidaten semantisch gegen das Ziel im Cache.
Nutzt clean_llm_text zur Entfernung von Steuerzeichen vor der Auswertung.
"""
target_id = edge.get("to")
target_ctx = batch_cache.get(target_id)
# Robust Lookup Fix (v2.12.2): Support für Anker
if not target_ctx and "#" in target_id:
base_id = target_id.split("#")[0]
target_ctx = batch_cache.get(base_id)
# Sicherheits-Fallback (Hard-Link Integrity)
if not target_ctx:
logger.info(f" [VALIDATION SKIP] No context for '{target_id}' - allowing link.")
return True
template = llm_service.get_prompt("edge_validation", provider)
try:
logger.info(f"⚖️ [VALIDATING] Relation '{edge.get('kind')}' -> '{target_id}'...")
prompt = template.format(
chunk_text=chunk_text[:1500],
target_title=target_ctx.title,
target_summary=target_ctx.summary,
edge_kind=edge.get("kind", "related_to")
)
# Die Antwort vom Service anfordern
raw_response = await llm_service.generate_raw_response(prompt, priority="background")
# WP-14 Fix: Zusätzliche Bereinigung zur Sicherstellung der Interpretierbarkeit
response = clean_llm_text(raw_response)
# Semantische Prüfung des Ergebnisses
is_valid = "YES" in response.upper()
if is_valid:
logger.info(f"✅ [VALIDATED] Relation to '{target_id}' confirmed.")
else:
logger.info(f"🚫 [REJECTED] Relation to '{target_id}' irrelevant for this chunk.")
return is_valid
except Exception as e:
logger.warning(f"⚠️ Validation error for {target_id}: {e}")
# Im Zweifel (Timeout/Fehler) erlauben wir die Kante, um Datenverlust zu vermeiden
return True