Update graph_derive_edges.py to version 4.2.2: implement semantic de-duplication with an explicit scope decision. Edges are now grouped by their semantic key (kind, source note, target, target section) independently of scope; a winning scope is then chosen per group (explicit:note_zone first, otherwise chunk scope), and the edge ID is computed only after that decision, so the same relation found in different scopes yields a single edge. Update the module documentation to reflect the new edge processing order and prioritization strategy.

2026-01-10 22:20:13 +01:00 · 2026-01-10 22:20:13 +01:00 · 6131b315d7
commit 6131b315d7
parent dfff46e45c
1 changed files with 89 additions and 51 deletions
--- a/app/core/graph/graph_derive_edges.py
+++ b/app/core/graph/graph_derive_edges.py
@@ -13,7 +13,11 @@ DESCRIPTION: Hauptlogik zur Kanten-Aggregation und De-Duplizierung.
             - Konsolidierte Callout-Extraktion (keine Duplikate)
             - Smart Scope-Priorisierung (chunk bevorzugt, außer bei höherer Provenance)
             - Effiziente Verarbeitung ohne redundante Scans
-VERSION: 4.2.1 (WP-24c: Clean-Context Bereinigung)
+             WP-24c v4.2.2: Semantische De-Duplizierung
+             - Gruppierung nach (kind, source, target, section) unabhängig vom Scope
+             - Scope-Entscheidung: explicit:note_zone > chunk-Scope
+             - ID-Berechnung erst nach Scope-Entscheidung
+VERSION: 4.2.2 (WP-24c: Semantische De-Duplizierung)
 STATUS: Active
 """
 import re
@@ -164,10 +168,13 @@ def extract_callouts_from_markdown(
        if not t:
            continue
        
-        # WP-24c v4.2.1: Prüfe, ob dieser Callout bereits in einem Chunk vorkommt
+        # WP-24c v4.2.2: Prüfe, ob dieser Callout bereits in einem Chunk vorkommt
+        # Härtung: Berücksichtigt auch Sektions-Anker (sec) für Multigraph-Präzision
+        # Ein Callout zu "Note#Section1" ist anders als "Note#Section2" oder "Note"
        callout_key = (k, t, sec)
        if callout_key in existing_chunk_callouts:
            # Callout ist bereits in Chunk erfasst -> überspringe (wird mit chunk-Scope angelegt)
+            # Die Sektion (sec) ist bereits im Key enthalten, daher wird Multigraph-Präzision gewährleistet
            continue
        
        # WP-24c v4.2.1: Callout ist NICHT in Chunks -> lege mit scope: "note" an
@@ -399,59 +406,90 @@ def build_edges_for_note(
        )
        edges.extend(callout_edges_from_markdown)

-    # 6) De-Duplizierung (In-Place) mit Priorisierung
-    # WP-24c v4.2.1: Smart Scope-Priorisierung
-    # - chunk-Scope wird bevorzugt (präzisere Information für RAG)
-    # - note-Scope gewinnt nur bei höherer Provenance-Priorität (z.B. explicit:note_zone)
-    # WP-24c v4.1.0: Da die EDGE-ID nun auf 5 Parametern basiert (inkl. target_section),
-    # bleiben Links auf unterschiedliche Abschnitte derselben Note als eigenständige 
-    # Kanten erhalten. Nur identische Sektions-Links werden nach Confidence und Provenance konsolidiert.
-    unique_map: Dict[str, dict] = {}
+    # 6) WP-24c v4.2.2: Semantische De-Duplizierung mit Scope-Entscheidung
+    # Problem: edge_id enthält Scope, daher werden semantisch identische Kanten
+    # (gleiches kind, source, target, section) mit unterschiedlichem Scope nicht erkannt.
+    # Lösung: Zuerst semantische Gruppierung, dann Scope-Entscheidung, dann ID-Berechnung.
+    
+    # Schritt 1: Semantische Gruppierung (unabhängig vom Scope)
+    # Schlüssel: (kind, source_id, target_id, target_section)
+    # Hinweis: source_id ist bei chunk-Scope die chunk_id, bei note-Scope die note_id
+    # Für semantische Gleichheit müssen wir prüfen: Ist die Quelle die gleiche Note?
+    semantic_groups: Dict[Tuple[str, str, str, Optional[str]], List[dict]] = {}
+    
    for e in edges:
-        eid = e["edge_id"]
+        kind = e.get("kind", "related_to")
+        source_id = e.get("source_id", "")
+        target_id = e.get("target_id", "")
+        target_section = e.get("target_section")
+        scope = e.get("scope", "chunk")
+        note_id_from_edge = e.get("note_id", "")
        
-        if eid not in unique_map:
-            unique_map[eid] = e
+        # WP-24c v4.2.2: Normalisiere source_id für semantische Gruppierung
+        # Bei chunk-Scope: source_id ist chunk_id, aber wir wollen nach note_id gruppieren
+        # Bei note-Scope: source_id ist bereits note_id
+        # Für semantische Gleichheit: Beide Kanten müssen von derselben Note ausgehen
+        if scope == "chunk":
+            # Bei chunk-Scope: source_id ist chunk_id, aber note_id ist im Edge vorhanden
+            # Wir verwenden note_id als semantische Quelle
+            semantic_source = note_id_from_edge
        else:
-            existing = unique_map[eid]
-            existing_scope = existing.get("scope", "chunk")
-            new_scope = e.get("scope", "chunk")
-            existing_prov = existing.get("provenance", "")
-            new_prov = e.get("provenance", "")
+            # Bei note-Scope: source_id ist bereits note_id
+            semantic_source = source_id
+        
+        # Semantischer Schlüssel: (kind, semantic_source, target_id, target_section)
+        semantic_key = (kind, semantic_source, target_id, target_section)
+        
+        if semantic_key not in semantic_groups:
+            semantic_groups[semantic_key] = []
+        semantic_groups[semantic_key].append(e)
+    
+    # Schritt 2: Scope-Entscheidung pro semantischer Gruppe
+    # Schritt 3: ID-Zuweisung nach Scope-Entscheidung
+    final_edges: List[dict] = []
+    
+    for semantic_key, group in semantic_groups.items():
+        if len(group) == 1:
+            # Nur eine Kante: Direkt verwenden, aber ID neu berechnen mit finalem Scope
+            winner = group[0]
+            final_scope = winner.get("scope", "chunk")
+            final_source = winner.get("source_id", "")
+            kind, semantic_source, target_id, target_section = semantic_key
            
-            # WP-24c v4.2.1: Scope-Priorisierung
-            # 1. explicit:note_zone hat höchste Priorität (unabhängig von Scope)
-            is_existing_note_zone = existing_prov == "explicit:note_zone"
-            is_new_note_zone = new_prov == "explicit:note_zone"
+            # WP-24c v4.2.2: Berechne edge_id mit finalem Scope
+            final_edge_id = _mk_edge_id(kind, final_source, target_id, final_scope, target_section=target_section)
+            winner["edge_id"] = final_edge_id
+            final_edges.append(winner)
+        else:
+            # Mehrere Kanten mit gleichem semantischen Schlüssel: Scope-Entscheidung
+            winner = None
            
-            if is_new_note_zone and not is_existing_note_zone:
-                # Neuer Link ist Note-Scope Zone -> ersetze
-                unique_map[eid] = e
-            elif is_existing_note_zone and not is_new_note_zone:
-                # Bestehender Link ist Note-Scope Zone -> behalte
-                pass
+            # Regel 1: explicit:note_zone hat höchste Priorität
+            note_zone_candidates = [e for e in group if e.get("provenance") == "explicit:note_zone"]
+            if note_zone_candidates:
+                # Wenn mehrere note_zone: Nimm die mit höchster Confidence
+                winner = max(note_zone_candidates, key=lambda e: e.get("confidence", 0))
            else:
-                # 2. chunk-Scope bevorzugen (präzisere Information)
-                if existing_scope == "chunk" and new_scope == "note":
-                    # Bestehender chunk-Scope -> behalte
-                    pass
-                elif existing_scope == "note" and new_scope == "chunk":
-                    # Neuer chunk-Scope -> ersetze (präziser)
-                    unique_map[eid] = e
+                # Regel 2: chunk-Scope bevorzugen (Präzisions-Vorteil)
+                chunk_candidates = [e for e in group if e.get("scope") == "chunk"]
+                if chunk_candidates:
+                    # Wenn mehrere chunk: Nimm die mit höchster Confidence * Priority
+                    winner = max(chunk_candidates, key=lambda e: (
+                        e.get("confidence", 0) * PROVENANCE_PRIORITY.get(e.get("provenance", ""), 0.7)
+                    ))
                else:
-                    # Gleicher Scope -> vergleiche Confidence und Provenance-Priority
-                    existing_conf = existing.get("confidence", 0)
-                    new_conf = e.get("confidence", 0)
-                    
-                    # Provenance-Priority berücksichtigen
-                    existing_priority = PROVENANCE_PRIORITY.get(existing_prov, 0.7)
-                    new_priority = PROVENANCE_PRIORITY.get(new_prov, 0.7)
-                    
-                    # Kombinierter Score: Confidence * Priority
-                    existing_score = existing_conf * existing_priority
-                    new_score = new_conf * new_priority
-                    
-                    if new_score > existing_score:
-                        unique_map[eid] = e
-                
-    return list(unique_map.values())
+                    # Regel 3: Fallback: Höchste Confidence * Priority
+                    winner = max(group, key=lambda e: (
+                        e.get("confidence", 0) * PROVENANCE_PRIORITY.get(e.get("provenance", ""), 0.7)
+                    ))
+            
+            # WP-24c v4.2.2: Berechne edge_id mit finalem Scope
+            final_scope = winner.get("scope", "chunk")
+            final_source = winner.get("source_id", "")
+            kind, semantic_source, target_id, target_section = semantic_key
+            
+            final_edge_id = _mk_edge_id(kind, final_source, target_id, final_scope, target_section=target_section)
+            winner["edge_id"] = final_edge_id
+            final_edges.append(winner)
+    
+    return final_edges