# mindnet/app/core/ingestion/ingestion_validation.py

"""
FILE: app/core/ingestion/ingestion_validation.py
DESCRIPTION: WP-15b semantische Validierung von Kanten gegen den LocalBatchCache.
             AUDIT v2.12.3: Integration der zentralen Text-Bereinigung (WP-14).
"""
import logging
from typing import Dict, Any
from app.core.parser import NoteContext

# ENTSCHEIDENDER FIX: Import der neutralen Bereinigungs-Logik zur Vermeidung von Circular Imports
from app.core.registry import clean_llm_text

logger = logging.getLogger(__name__)

async def validate_edge_candidate(
    chunk_text: str,
    edge: Dict[str, Any],
    batch_cache: Dict[str, NoteContext],
    llm_service: Any,
    provider: str
) -> bool:
    """
    WP-15b: Semantically validate an edge candidate against its target in the cache.

    The target note is looked up in ``batch_cache`` via ``edge["to"]``. If a
    context is found, the LLM is asked (using the ``edge_validation`` prompt)
    whether the relation fits the chunk. The raw answer is passed through
    ``clean_llm_text`` before it is evaluated.

    The function fails open: whenever no decision can be made (no target
    context, LLM error, malformed response) the edge is accepted so that no
    link is lost.

    Args:
        chunk_text: Text of the chunk the edge originates from; only the
            first 1500 characters are sent to the LLM.
        edge: Edge candidate; the keys ``"to"`` (target id) and ``"kind"``
            (relation type, defaults to ``"related_to"``) are read.
        batch_cache: Mapping of note id to the cached note context.
        llm_service: Service object providing ``get_prompt`` and the
            awaitable ``generate_raw_response``.
        provider: Provider name forwarded to ``llm_service.get_prompt``.

    Returns:
        True if the cleaned LLM response contains "YES" (case-insensitive),
        or if validation was skipped or failed; False otherwise.
    """
    target_id = edge.get("to")
    target_ctx = batch_cache.get(target_id)

    # Robust lookup (v2.12.2): support anchors such as "note-id#section" by
    # retrying with the part before the first '#'. The isinstance guard keeps
    # a missing or non-string target id from raising TypeError here, so such
    # edges reach the fail-open fallback below instead of crashing the caller.
    if not target_ctx and isinstance(target_id, str) and "#" in target_id:
        base_id = target_id.partition("#")[0]
        target_ctx = batch_cache.get(base_id)

    # Safety fallback (hard-link integrity): without a target context there is
    # nothing to validate against, so the link is allowed.
    if not target_ctx:
        logger.info(f"ℹ️ [VALIDATION SKIP] No context for '{target_id}' - allowing link.")
        return True

    template = llm_service.get_prompt("edge_validation", provider)

    # Resolve the relation type once so the log line and the prompt agree.
    edge_kind = edge.get("kind", "related_to")

    try:
        logger.info(f"⚖️ [VALIDATING] Relation '{edge_kind}' -> '{target_id}'...")
        prompt = template.format(
            chunk_text=chunk_text[:1500],
            target_title=target_ctx.title,
            target_summary=target_ctx.summary,
            edge_kind=edge_kind
        )

        # Request the answer from the LLM service.
        raw_response = await llm_service.generate_raw_response(prompt, priority="background")

        # WP-14: clean the raw text before interpreting it.
        response = clean_llm_text(raw_response)

        # Semantic check: a plain substring match on "YES", case-insensitive.
        is_valid = "YES" in response.upper()

        if is_valid:
            logger.info(f"✅ [VALIDATED] Relation to '{target_id}' confirmed.")
        else:
            logger.info(f"🚫 [REJECTED] Relation to '{target_id}' irrelevant for this chunk.")
        return is_valid
    except Exception as e:
        logger.warning(f"⚠️ Validation error for {target_id}: {e}")
        # When in doubt (timeout/error) the edge is allowed to avoid data loss.
        return True