From 9b0d8c18cb5ee8a34abe4ee8bcbb23efcf6be7ab Mon Sep 17 00:00:00 2001 From: Lars Date: Sun, 11 Jan 2026 21:27:07 +0100 Subject: [PATCH] Implement LLM validation for candidate edges in ingestion_processor.py Enhance the edge validation process by introducing logic to validate edges with rule IDs starting with "candidate:". This includes extracting target IDs, validating against the entire note text, and updating rule IDs upon successful validation. Rejected edges are logged for traceability, improving the overall handling of edge data during ingestion. --- app/core/ingestion/ingestion_processor.py | 68 ++++++++++++++++++++++- 1 file changed, 67 insertions(+), 1 deletion(-) diff --git a/app/core/ingestion/ingestion_processor.py b/app/core/ingestion/ingestion_processor.py index 3c1ee21..bc8cd68 100644 --- a/app/core/ingestion/ingestion_processor.py +++ b/app/core/ingestion/ingestion_processor.py @@ -273,8 +273,74 @@ class IngestionService: markdown_body=markdown_body ) - explicit_edges = [] + # WP-24c v4.5.8: Phase 3 - LLM-Validierung für candidate: Kanten + # Prüfe alle Kanten mit rule_id beginnend mit "candidate:" + # Verwende den gesamten Note-Text für die Validierung + note_text = markdown_body or " ".join([c.get("text", "") or c.get("window", "") for c in chunk_pls]) + validated_edges = [] + rejected_edges = [] + for e in raw_edges: + rule_id = e.get("rule_id", "") + # WP-24c v4.5.8: Trigger-Logik basierend auf rule_id (nicht provenance) + if rule_id and rule_id.startswith("candidate:"): + # Extrahiere target_id für Validierung (aus verschiedenen möglichen Feldern) + target_id = e.get("target_id") or e.get("to") + if not target_id: + # Fallback: Versuche aus Payload zu extrahieren + payload = e.get("extra", {}) if isinstance(e.get("extra"), dict) else {} + target_id = payload.get("target_id") or payload.get("to") + + if not target_id: + logger.warning(f"⚠️ [VALIDATION] Keine target_id gefunden für Kante: {e}") + rejected_edges.append(e) + continue + + kind = e.get("kind", "related_to") + source_id = e.get("source_id", note_id) + + # Erstelle Edge-Dict für Validierung (kompatibel mit validate_edge_candidate) + edge_for_validation = { + "kind": kind, + "to": target_id, # validate_edge_candidate erwartet "to" + "target_id": target_id, + "provenance": e.get("provenance", "explicit"), + "confidence": e.get("confidence", 0.9) + } + + logger.info(f"🚀 [VALIDATION] Prüfe Kandidat: {source_id} --{kind}--> {target_id}") + + # WP-24c v4.5.8: Validiere gegen den gesamten Note-Text + is_valid = await validate_edge_candidate( + chunk_text=note_text, + edge=edge_for_validation, + batch_cache=self.batch_cache, + llm_service=self.llm, + profile_name="ingest_validator" + ) + + if is_valid: + # WP-24c v4.5.8: Entferne candidate: Präfix (Kante wird zum Fakt) + new_rule_id = rule_id.replace("candidate:", "").strip() + if not new_rule_id: + new_rule_id = e.get("provenance", "explicit") + + # Aktualisiere rule_id im Edge (die _edge Funktion merged extra direkt ins Haupt-Dict) + e["rule_id"] = new_rule_id + + validated_edges.append(e) + logger.info(f"✅ [VALIDATION] Kandidat bestätigt: {source_id} --{kind}--> {target_id} -> rule_id: {new_rule_id}") + else: + # WP-24c v4.5.8: Kante ablehnen (nicht zu validated_edges hinzufügen) + rejected_edges.append(e) + logger.info(f"🚫 [VALIDATION] Kandidat abgelehnt: {source_id} --{kind}--> {target_id}") + else: + # WP-24c v4.5.8: Keine candidate: Kante -> direkt übernehmen + validated_edges.append(e) + + # WP-24c v4.5.8: Verwende validated_edges statt raw_edges für weitere Verarbeitung + explicit_edges = [] + for e in validated_edges: t_raw = e.get("target_id") t_ctx = self.batch_cache.get(t_raw) t_id = t_ctx.note_id if t_ctx else t_raw