Refactor edge validation process in ingestion_processor.py

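Remove LLM validation from the per-chunk candidate-pool loop, which now performs only target-ID validation (ghost-ID protection), and defer it to Phase 3, after build_edges_for_note has produced the note's edges. Phase 3 now treats an edge as a candidate when either its rule_id or its provenance starts with "candidate:", and selects the validation context by scope: note-scope edges are checked against a summary built from the first five chunks (falling back to the full note text), while chunk-scope edges are checked against the matching chunk's text (falling back to the full note text). Validated edges have the "candidate:" prefix stripped from rule_id and provenance; rejected edges are logged and excluded from further processing. Log messages are retagged from [VALIDATION] to [PHASE 3]. The change is intended to improve the efficiency and accuracy of edge validation during ingestion.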
2026-01-11 21:47:11 +01:00 · 2026-01-11 21:47:11 +01:00 · b19f91c3ee
commit b19f91c3ee
parent 9b0d8c18cb
1 changed file with 60 additions and 27 deletions
--- a/app/core/ingestion/ingestion_processor.py
+++ b/app/core/ingestion/ingestion_processor.py
@@ -239,21 +239,19 @@ class IngestionService:
            enable_smart = chunk_cfg.get("enable_smart_edge_allocation", False)
            chunks = await assemble_chunks(note_id, getattr(parsed, "body", ""), note_type, config=chunk_cfg)
            
+            # WP-24c v4.5.8: Validierung in Chunk-Schleife entfernt
+            # Alle candidate: Kanten werden jetzt in Phase 3 (nach build_edges_for_note) validiert
+            # Dies stellt sicher, dass auch Note-Scope Kanten aus LLM-Validierungs-Zonen geprüft werden
+            # Der candidate_pool wird unverändert weitergegeben, damit build_edges_for_note alle Kanten erkennt
+            # WP-24c v4.5.8: Nur ID-Validierung bleibt (Ghost-ID Schutz), keine LLM-Validierung mehr hier
            for ch in chunks:
                new_pool = []
                for cand in getattr(ch, "candidate_pool", []):
-                    # WP-24c v4.4.1: Harmonisierung - akzeptiere sowohl "to" als auch "target_id"
-                    # Der chunking_processor verwendet "to", daher muss die Validierung beide Keys unterstützen
+                    # WP-24c v4.5.8: Nur ID-Validierung (Ghost-ID Schutz)
                    t_id = cand.get('target_id') or cand.get('to') or cand.get('note_id')
-                    if not self._is_valid_id(t_id): continue
-                    
-                    # WP-24c v4.4.1: explicit:callout Kanten werden NICHT validiert (bereits präzise)
-                    # Sie müssen den Pool passieren, damit sie in Phase 1 erkannt werden
-                    if cand.get("provenance") == "global_pool" and enable_smart:
-                        is_valid = await validate_edge_candidate(ch.text, cand, self.batch_cache, self.llm)
-                        if is_valid: new_pool.append(cand)
-                    else: 
-                        # WP-24c v4.4.1: Alle anderen Provenances (inkl. explicit:callout) passieren ohne Validierung
+                    if not self._is_valid_id(t_id): 
+                        continue
+                    # WP-24c v4.5.8: Alle Kanten gehen durch - LLM-Validierung erfolgt in Phase 3
                    new_pool.append(cand)
                ch.candidate_pool = new_pool

@@ -273,17 +271,27 @@ class IngestionService:
                markdown_body=markdown_body
            )
            
-            # WP-24c v4.5.8: Phase 3 - LLM-Validierung für candidate: Kanten
-            # Prüfe alle Kanten mit rule_id beginnend mit "candidate:"
-            # Verwende den gesamten Note-Text für die Validierung
+            # WP-24c v4.5.8: Phase 3 - Finaler Validierungs-Gate für candidate: Kanten
+            # Prüfe alle Kanten mit rule_id ODER provenance beginnend mit "candidate:"
+            # Dies schließt alle Kandidaten ein, unabhängig von ihrer Herkunft (global_pool, explicit:callout, etc.)
+            
+            # WP-24c v4.5.8: Kontext-Optimierung für Note-Scope Kanten
+            # Aggregiere den gesamten Note-Text für bessere Validierungs-Entscheidungen
            note_text = markdown_body or " ".join([c.get("text", "") or c.get("window", "") for c in chunk_pls])
+            # Erstelle eine Note-Summary aus den wichtigsten Chunks (für bessere Kontext-Qualität)
+            note_summary = " ".join([c.get("window", "") or c.get("text", "") for c in chunk_pls[:5]])  # Top 5 Chunks
+            
            validated_edges = []
            rejected_edges = []
            
            for e in raw_edges:
                rule_id = e.get("rule_id", "")
-                # WP-24c v4.5.8: Trigger-Logik basierend auf rule_id (nicht provenance)
-                if rule_id and rule_id.startswith("candidate:"):
+                provenance = e.get("provenance", "")
+                
+                # WP-24c v4.5.8: Trigger-Kriterium - rule_id ODER provenance beginnt mit "candidate:"
+                is_candidate = (rule_id and rule_id.startswith("candidate:")) or (provenance and provenance.startswith("candidate:"))
+                
+                if is_candidate:
                    # Extrahiere target_id für Validierung (aus verschiedenen möglichen Feldern)
                    target_id = e.get("target_id") or e.get("to")
                    if not target_id:
@@ -292,27 +300,45 @@ class IngestionService:
                        target_id = payload.get("target_id") or payload.get("to")
                    
                    if not target_id:
-                        logger.warning(f"⚠️ [VALIDATION] Keine target_id gefunden für Kante: {e}")
+                        logger.warning(f"⚠️ [PHASE 3] Keine target_id gefunden für Kante: {e}")
                        rejected_edges.append(e)
                        continue
                    
                    kind = e.get("kind", "related_to")
                    source_id = e.get("source_id", note_id)
+                    scope = e.get("scope", "chunk")
+                    
+                    # WP-24c v4.5.8: Kontext-Optimierung für Note-Scope Kanten
+                    # Für scope: note verwende Note-Summary oder gesamten Note-Text
+                    # Für scope: chunk verwende den spezifischen Chunk-Text (falls verfügbar)
+                    if scope == "note":
+                        validation_text = note_summary or note_text
+                        context_info = "Note-Scope (aggregiert)"
+                    else:
+                        # Für Chunk-Scope: Versuche Chunk-Text zu finden, sonst Note-Text
+                        chunk_id = e.get("chunk_id") or source_id
+                        chunk_text = None
+                        for ch in chunk_pls:
+                            if ch.get("chunk_id") == chunk_id or ch.get("id") == chunk_id:
+                                chunk_text = ch.get("text") or ch.get("window", "")
+                                break
+                        validation_text = chunk_text or note_text
+                        context_info = f"Chunk-Scope ({chunk_id})"
                    
                    # Erstelle Edge-Dict für Validierung (kompatibel mit validate_edge_candidate)
                    edge_for_validation = {
                        "kind": kind,
                        "to": target_id,  # validate_edge_candidate erwartet "to"
                        "target_id": target_id,
-                        "provenance": e.get("provenance", "explicit"),
+                        "provenance": provenance if not provenance.startswith("candidate:") else provenance.replace("candidate:", "").strip(),
                        "confidence": e.get("confidence", 0.9)
                    }
                    
-                    logger.info(f"🚀 [VALIDATION] Prüfe Kandidat: {source_id} --{kind}--> {target_id}")
+                    logger.info(f"🚀 [PHASE 3] Validierung: {source_id} -> {target_id} ({kind}) | Scope: {scope} | Kontext: {context_info}")
                    
-                    # WP-24c v4.5.8: Validiere gegen den gesamten Note-Text
+                    # WP-24c v4.5.8: Validiere gegen optimierten Kontext
                    is_valid = await validate_edge_candidate(
-                        chunk_text=note_text,
+                        chunk_text=validation_text,
                        edge=edge_for_validation,
                        batch_cache=self.batch_cache,
                        llm_service=self.llm,
@@ -321,24 +347,31 @@ class IngestionService:
                    
                    if is_valid:
                        # WP-24c v4.5.8: Entferne candidate: Präfix (Kante wird zum Fakt)
-                        new_rule_id = rule_id.replace("candidate:", "").strip()
+                        new_rule_id = rule_id.replace("candidate:", "").strip() if rule_id else provenance.replace("candidate:", "").strip() if provenance.startswith("candidate:") else provenance
                        if not new_rule_id:
-                            new_rule_id = e.get("provenance", "explicit")
+                            new_rule_id = e.get("provenance", "explicit").replace("candidate:", "").strip()
                        
-                        # Aktualisiere rule_id im Edge (die _edge Funktion merged extra direkt ins Haupt-Dict)
+                        # Aktualisiere rule_id und provenance im Edge
                        e["rule_id"] = new_rule_id
+                        if provenance.startswith("candidate:"):
+                            e["provenance"] = provenance.replace("candidate:", "").strip()
                        
                        validated_edges.append(e)
-                        logger.info(f"✅ [VALIDATION] Kandidat bestätigt: {source_id} --{kind}--> {target_id} -> rule_id: {new_rule_id}")
+                        logger.info(f"✅ [PHASE 3] VERIFIED: {source_id} -> {target_id} ({kind}) | rule_id: {new_rule_id}")
                    else:
                        # WP-24c v4.5.8: Kante ablehnen (nicht zu validated_edges hinzufügen)
                        rejected_edges.append(e)
-                        logger.info(f"🚫 [VALIDATION] Kandidat abgelehnt: {source_id} --{kind}--> {target_id}")
+                        logger.info(f"🚫 [PHASE 3] REJECTED: {source_id} -> {target_id} ({kind})")
                else:
                    # WP-24c v4.5.8: Keine candidate: Kante -> direkt übernehmen
                    validated_edges.append(e)
            
+            # WP-24c v4.5.8: Phase 3 abgeschlossen - rejected_edges werden NICHT weiterverarbeitet
+            if rejected_edges:
+                logger.info(f"🚫 [PHASE 3] {len(rejected_edges)} Kanten abgelehnt und werden nicht in die DB geschrieben")
+            
            # WP-24c v4.5.8: Verwende validated_edges statt raw_edges für weitere Verarbeitung
+            # Nur verified Kanten (ohne candidate: Präfix) werden in Phase 2 (Symmetrie) verarbeitet
            explicit_edges = []
            for e in validated_edges:
                t_raw = e.get("target_id")