Implement LLM validation for candidate edges in ingestion_processor.py

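Enhance the edge validation process by adding LLM validation for edges whose rule ID starts with "candidate:". For each such edge, the target ID is extracted (from `target_id` or `to`, falling back to the `extra` payload), the edge is validated against the entire note text, and on successful validation the "candidate:" prefix is removed from the rule ID. Candidates that fail validation or have no target ID are rejected and logged for traceability, while all other edges pass through unchanged, improving the overall handling of edge data during ingestion.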
This commit is contained in:
Lars 2026-01-11 21:27:07 +01:00
parent f2a2f4d2df
commit 9b0d8c18cb

View File

@ -273,8 +273,74 @@ class IngestionService:
markdown_body=markdown_body markdown_body=markdown_body
) )
explicit_edges = [] # WP-24c v4.5.8: Phase 3 - LLM-Validierung für candidate: Kanten
# Prüfe alle Kanten mit rule_id beginnend mit "candidate:"
# Verwende den gesamten Note-Text für die Validierung
note_text = markdown_body or " ".join([c.get("text", "") or c.get("window", "") for c in chunk_pls])
validated_edges = []
rejected_edges = []
for e in raw_edges: for e in raw_edges:
rule_id = e.get("rule_id", "")
# WP-24c v4.5.8: Trigger-Logik basierend auf rule_id (nicht provenance)
if rule_id and rule_id.startswith("candidate:"):
# Extrahiere target_id für Validierung (aus verschiedenen möglichen Feldern)
target_id = e.get("target_id") or e.get("to")
if not target_id:
# Fallback: Versuche aus Payload zu extrahieren
payload = e.get("extra", {}) if isinstance(e.get("extra"), dict) else {}
target_id = payload.get("target_id") or payload.get("to")
if not target_id:
logger.warning(f"⚠️ [VALIDATION] Keine target_id gefunden für Kante: {e}")
rejected_edges.append(e)
continue
kind = e.get("kind", "related_to")
source_id = e.get("source_id", note_id)
# Erstelle Edge-Dict für Validierung (kompatibel mit validate_edge_candidate)
edge_for_validation = {
"kind": kind,
"to": target_id, # validate_edge_candidate erwartet "to"
"target_id": target_id,
"provenance": e.get("provenance", "explicit"),
"confidence": e.get("confidence", 0.9)
}
logger.info(f"🚀 [VALIDATION] Prüfe Kandidat: {source_id} --{kind}--> {target_id}")
# WP-24c v4.5.8: Validiere gegen den gesamten Note-Text
is_valid = await validate_edge_candidate(
chunk_text=note_text,
edge=edge_for_validation,
batch_cache=self.batch_cache,
llm_service=self.llm,
profile_name="ingest_validator"
)
if is_valid:
# WP-24c v4.5.8: Entferne candidate: Präfix (Kante wird zum Fakt)
new_rule_id = rule_id.replace("candidate:", "").strip()
if not new_rule_id:
new_rule_id = e.get("provenance", "explicit")
# Aktualisiere rule_id im Edge (die _edge Funktion merged extra direkt ins Haupt-Dict)
e["rule_id"] = new_rule_id
validated_edges.append(e)
logger.info(f"✅ [VALIDATION] Kandidat bestätigt: {source_id} --{kind}--> {target_id} -> rule_id: {new_rule_id}")
else:
# WP-24c v4.5.8: Kante ablehnen (nicht zu validated_edges hinzufügen)
rejected_edges.append(e)
logger.info(f"🚫 [VALIDATION] Kandidat abgelehnt: {source_id} --{kind}--> {target_id}")
else:
# WP-24c v4.5.8: Keine candidate: Kante -> direkt übernehmen
validated_edges.append(e)
# WP-24c v4.5.8: Verwende validated_edges statt raw_edges für weitere Verarbeitung
explicit_edges = []
for e in validated_edges:
t_raw = e.get("target_id") t_raw = e.get("target_id")
t_ctx = self.batch_cache.get(t_raw) t_ctx = self.batch_cache.get(t_raw)
t_id = t_ctx.note_id if t_ctx else t_raw t_id = t_ctx.note_id if t_ctx else t_raw