mindnet/app/core/ingestion/ingestion_validation.py

81 lines
3.2 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
FILE: app/core/ingestion/ingestion_validation.py
DESCRIPTION: WP-15b semantische Validierung von Kanten gegen den LocalBatchCache.
WP-25a: Integration der Mixture of Experts (MoE) Profil-Steuerung.
VERSION: 2.13.0 (WP-25a: MoE & Profile Support)
STATUS: Active
FIX:
- Umstellung auf generate_raw_response mit profile_name="ingest_validator".
- Automatische Nutzung der Fallback-Kaskade (Cloud -> Lokal) via LLMService.
- Erhalt der sparsamen LLM-Nutzung (Validierung nur für Kandidaten-Pool).
"""
import logging
from typing import Dict, Any, Optional
from app.core.parser import NoteContext
# ENTSCHEIDENDER FIX: Import der neutralen Bereinigungs-Logik zur Vermeidung von Circular Imports
from app.core.registry import clean_llm_text
logger = logging.getLogger(__name__)
async def validate_edge_candidate(
chunk_text: str,
edge: Dict,
batch_cache: Dict[str, NoteContext],
llm_service: Any,
provider: Optional[str] = None,
profile_name: str = "ingest_validator"
) -> bool:
"""
WP-15b: Validiert einen Kandidaten semantisch gegen das Ziel im Cache.
Nutzt das MoE-Profil 'ingest_validator' für deterministische YES/NO Prüfungen.
"""
target_id = edge.get("to")
target_ctx = batch_cache.get(target_id)
# Robust Lookup Fix (v2.12.2): Support für Anker
if not target_ctx and "#" in target_id:
base_id = target_id.split("#")[0]
target_ctx = batch_cache.get(base_id)
# Sicherheits-Fallback (Hard-Link Integrity)
# Explizite Wikilinks oder Callouts werden nicht durch das LLM verifiziert.
if not target_ctx:
logger.info(f" [VALIDATION SKIP] No context for '{target_id}' - allowing link.")
return True
# Prompt-Abruf (Nutzt Provider-String als Fallback-Key für die prompts.yaml)
template = llm_service.get_prompt("edge_validation", provider)
try:
logger.info(f"⚖️ [VALIDATING] Relation '{edge.get('kind')}' -> '{target_id}' (Profile: {profile_name})...")
prompt = template.format(
chunk_text=chunk_text[:1500],
target_title=target_ctx.title,
target_summary=target_ctx.summary,
edge_kind=edge.get("kind", "related_to")
)
# WP-25a: Profilbasierter Aufruf (Delegiert Fallbacks an den Service)
# Nutzt ingest_validator (Cloud Mistral/Gemini -> Local Phi3:mini Kaskade)
raw_response = await llm_service.generate_raw_response(
prompt,
priority="background",
profile_name=profile_name
)
# WP-14 Fix: Zusätzliche Bereinigung zur Sicherstellung der Interpretierbarkeit
response = clean_llm_text(raw_response)
# Semantische Prüfung des Ergebnisses
is_valid = "YES" in response.upper()
if is_valid:
logger.info(f"✅ [VALIDATED] Relation to '{target_id}' confirmed.")
else:
logger.info(f"🚫 [REJECTED] Relation to '{target_id}' irrelevant for this chunk.")
return is_valid
except Exception as e:
logger.warning(f"⚠️ Validation error for {target_id} using {profile_name}: {e}")
# Im Zweifel (Timeout/Fehler) erlauben wir die Kante, um Datenverlust zu vermeiden
return True