2026-01-12 10:53:20 +01:00
1 changed file with 89 additions and 51 deletions
--- a/app/core/graph/graph_derive_edges.py
+++ b/app/core/graph/graph_derive_edges.py
@@ -13,7 +13,11 @@ DESCRIPTION: Hauptlogik zur Kanten-Aggregation und De-Duplizierung.
             - Konsolidierte Callout-Extraktion (keine Duplikate)
             - Smart Scope-Priorisierung (chunk bevorzugt, außer bei höherer Provenance)
             - Effiziente Verarbeitung ohne redundante Scans
-VERSION: 4.2.1 (WP-24c: Clean-Context Bereinigung)
+             WP-24c v4.2.2: Semantische De-Duplizierung
+             - Gruppierung nach (kind, source, target, section) unabhängig vom Scope
+             - Scope-Entscheidung: explicit:note_zone > chunk-Scope
+             - ID-Berechnung erst nach Scope-Entscheidung
+VERSION: 4.2.2 (WP-24c: Semantische De-Duplizierung)
 STATUS: Active
 """
 import re
@@ -164,10 +168,13 @@ def extract_callouts_from_markdown(
        if not t:
            continue
        
-        # WP-24c v4.2.1: Prüfe, ob dieser Callout bereits in einem Chunk vorkommt
+        # WP-24c v4.2.2: Prüfe, ob dieser Callout bereits in einem Chunk vorkommt
+        # Härtung: Berücksichtigt auch Sektions-Anker (sec) für Multigraph-Präzision
+        # Ein Callout zu "Note#Section1" ist anders als "Note#Section2" oder "Note"
        callout_key = (k, t, sec)
        if callout_key in existing_chunk_callouts:
            # Callout ist bereits in Chunk erfasst -> überspringe (wird mit chunk-Scope angelegt)
+            # Die Sektion (sec) ist bereits im Key enthalten, daher wird Multigraph-Präzision gewährleistet
            continue
        
        # WP-24c v4.2.1: Callout ist NICHT in Chunks -> lege mit scope: "note" an
@@ -399,59 +406,90 @@ def build_edges_for_note(
        )
        edges.extend(callout_edges_from_markdown)

-    # 6) De-Duplizierung (In-Place) mit Priorisierung
-    # WP-24c v4.2.1: Smart Scope-Priorisierung
-    # - chunk-Scope wird bevorzugt (präzisere Information für RAG)
-    # - note-Scope gewinnt nur bei höherer Provenance-Priorität (z.B. explicit:note_zone)
-    # WP-24c v4.1.0: Da die EDGE-ID nun auf 5 Parametern basiert (inkl. target_section),
-    # bleiben Links auf unterschiedliche Abschnitte derselben Note als eigenständige 
-    # Kanten erhalten. Nur identische Sektions-Links werden nach Confidence und Provenance konsolidiert.
-    unique_map: Dict[str, dict] = {}
+    # 6) WP-24c v4.2.2: Semantische De-Duplizierung mit Scope-Entscheidung
+    # Problem: edge_id enthält Scope, daher werden semantisch identische Kanten
+    # (gleiches kind, source, target, section) mit unterschiedlichem Scope nicht erkannt.
+    # Lösung: Zuerst semantische Gruppierung, dann Scope-Entscheidung, dann ID-Berechnung.
+    
+    # Schritt 1: Semantische Gruppierung (unabhängig vom Scope)
+    # Schlüssel: (kind, source_id, target_id, target_section)
+    # Hinweis: source_id ist bei chunk-Scope die chunk_id, bei note-Scope die note_id
+    # Für semantische Gleichheit müssen wir prüfen: Ist die Quelle die gleiche Note?
+    semantic_groups: Dict[Tuple[str, str, str, Optional[str]], List[dict]] = {}
+    
    for e in edges:
-        eid = e["edge_id"]
+        kind = e.get("kind", "related_to")
+        source_id = e.get("source_id", "")
+        target_id = e.get("target_id", "")
+        target_section = e.get("target_section")
+        scope = e.get("scope", "chunk")
+        note_id_from_edge = e.get("note_id", "")
        
-        if eid not in unique_map:
-            unique_map[eid] = e
+        # WP-24c v4.2.2: Normalisiere source_id für semantische Gruppierung
+        # Bei chunk-Scope: source_id ist chunk_id, aber wir wollen nach note_id gruppieren
+        # Bei note-Scope: source_id ist bereits note_id
+        # Für semantische Gleichheit: Beide Kanten müssen von derselben Note ausgehen
+        if scope == "chunk":
+            # Bei chunk-Scope: source_id ist chunk_id, aber note_id ist im Edge vorhanden
+            # Wir verwenden note_id als semantische Quelle
+            semantic_source = note_id_from_edge
        else:
-            existing = unique_map[eid]
-            existing_scope = existing.get("scope", "chunk")
-            new_scope = e.get("scope", "chunk")
-            existing_prov = existing.get("provenance", "")
-            new_prov = e.get("provenance", "")
+            # Bei note-Scope: source_id ist bereits note_id
+            semantic_source = source_id
+        
+        # Semantischer Schlüssel: (kind, semantic_source, target_id, target_section)
+        semantic_key = (kind, semantic_source, target_id, target_section)
+        
+        if semantic_key not in semantic_groups:
+            semantic_groups[semantic_key] = []
+        semantic_groups[semantic_key].append(e)
+    
+    # Schritt 2: Scope-Entscheidung pro semantischer Gruppe
+    # Schritt 3: ID-Zuweisung nach Scope-Entscheidung
+    final_edges: List[dict] = []
+    
+    for semantic_key, group in semantic_groups.items():
+        if len(group) == 1:
+            # Nur eine Kante: Direkt verwenden, aber ID neu berechnen mit finalem Scope
+            winner = group[0]
+            final_scope = winner.get("scope", "chunk")
+            final_source = winner.get("source_id", "")
+            kind, semantic_source, target_id, target_section = semantic_key
            
-            # WP-24c v4.2.1: Scope-Priorisierung
-            # 1. explicit:note_zone hat höchste Priorität (unabhängig von Scope)
-            is_existing_note_zone = existing_prov == "explicit:note_zone"
-            is_new_note_zone = new_prov == "explicit:note_zone"
+            # WP-24c v4.2.2: Berechne edge_id mit finalem Scope
+            final_edge_id = _mk_edge_id(kind, final_source, target_id, final_scope, target_section=target_section)
+            winner["edge_id"] = final_edge_id
+            final_edges.append(winner)
+        else:
+            # Mehrere Kanten mit gleichem semantischen Schlüssel: Scope-Entscheidung
+            winner = None
            
-            if is_new_note_zone and not is_existing_note_zone:
-                # Neuer Link ist Note-Scope Zone -> ersetze
-                unique_map[eid] = e
-            elif is_existing_note_zone and not is_new_note_zone:
-                # Bestehender Link ist Note-Scope Zone -> behalte
-                pass
+            # Regel 1: explicit:note_zone hat höchste Priorität
+            note_zone_candidates = [e for e in group if e.get("provenance") == "explicit:note_zone"]
+            if note_zone_candidates:
+                # Wenn mehrere note_zone: Nimm die mit höchster Confidence
+                winner = max(note_zone_candidates, key=lambda e: e.get("confidence", 0))
            else:
-                # 2. chunk-Scope bevorzugen (präzisere Information)
-                if existing_scope == "chunk" and new_scope == "note":
-                    # Bestehender chunk-Scope -> behalte
-                    pass
-                elif existing_scope == "note" and new_scope == "chunk":
-                    # Neuer chunk-Scope -> ersetze (präziser)
-                    unique_map[eid] = e
+                # Regel 2: chunk-Scope bevorzugen (Präzisions-Vorteil)
+                chunk_candidates = [e for e in group if e.get("scope") == "chunk"]
+                if chunk_candidates:
+                    # Wenn mehrere chunk: Nimm die mit höchster Confidence * Priority
+                    winner = max(chunk_candidates, key=lambda e: (
+                        e.get("confidence", 0) * PROVENANCE_PRIORITY.get(e.get("provenance", ""), 0.7)
+                    ))
                else:
-                    # Gleicher Scope -> vergleiche Confidence und Provenance-Priority
-                    existing_conf = existing.get("confidence", 0)
-                    new_conf = e.get("confidence", 0)
-                    
-                    # Provenance-Priority berücksichtigen
-                    existing_priority = PROVENANCE_PRIORITY.get(existing_prov, 0.7)
-                    new_priority = PROVENANCE_PRIORITY.get(new_prov, 0.7)
-                    
-                    # Kombinierter Score: Confidence * Priority
-                    existing_score = existing_conf * existing_priority
-                    new_score = new_conf * new_priority
-                    
-                    if new_score > existing_score:
-                        unique_map[eid] = e
-                
-    return list(unique_map.values())
+                    # Regel 3: Fallback: Höchste Confidence * Priority
+                    winner = max(group, key=lambda e: (
+                        e.get("confidence", 0) * PROVENANCE_PRIORITY.get(e.get("provenance", ""), 0.7)
+                    ))
+            
+            # WP-24c v4.2.2: Berechne edge_id mit finalem Scope
+            final_scope = winner.get("scope", "chunk")
+            final_source = winner.get("source_id", "")
+            kind, semantic_source, target_id, target_section = semantic_key
+            
+            final_edge_id = _mk_edge_id(kind, final_source, target_id, final_scope, target_section=target_section)
+            winner["edge_id"] = final_edge_id
+            final_edges.append(winner)
+    
+    return final_edges