Update graph_derive_edges.py to version 4.2.2: implement semantic de-duplication with an explicit scope decision. Edges are first grouped semantically by (kind, source, target, section) regardless of scope; the scope is then decided per group (explicit:note_zone takes precedence over chunk scope), and the edge ID is computed only after that decision, so semantically identical edges that differ only in scope are recognized as duplicates. Update the module documentation to describe the new processing order and prioritization strategy.

2026-01-10 22:20:13 +01:00 · 2026-01-10 22:20:13 +01:00 · 6131b315d7
commit 6131b315d7
parent dfff46e45c
1 changed file with 89 additions and 51 deletions
--- a/app/core/graph/graph_derive_edges.py
+++ b/app/core/graph/graph_derive_edges.py
@@ -13,7 +13,11 @@ DESCRIPTION: Hauptlogik zur Kanten-Aggregation und De-Duplizierung.
             - Konsolidierte Callout-Extraktion (keine Duplikate)
             - Smart Scope-Priorisierung (chunk bevorzugt, außer bei höherer Provenance)
             - Effiziente Verarbeitung ohne redundante Scans
-VERSION: 4.2.1 (WP-24c: Clean-Context Bereinigung)
+             WP-24c v4.2.2: Semantische De-Duplizierung
             - Gruppierung nach (kind, source, target, section) unabhängig vom Scope
             - Scope-Entscheidung: explicit:note_zone > chunk-Scope
             - ID-Berechnung erst nach Scope-Entscheidung
 VERSION: 4.2.2 (WP-24c: Semantische De-Duplizierung)
 STATUS: Active
 """
 import re
@@ -164,10 +168,13 @@ def extract_callouts_from_markdown(
        if not t:
            continue
-        # WP-24c v4.2.1: Prüfe, ob dieser Callout bereits in einem Chunk vorkommt
+        # WP-24c v4.2.2: Prüfe, ob dieser Callout bereits in einem Chunk vorkommt
        # Härtung: Berücksichtigt auch Sektions-Anker (sec) für Multigraph-Präzision
        # Ein Callout zu "Note#Section1" ist anders als "Note#Section2" oder "Note"
        callout_key = (k, t, sec)
        if callout_key in existing_chunk_callouts:
            # Callout ist bereits in Chunk erfasst -> überspringe (wird mit chunk-Scope angelegt)
            # Die Sektion (sec) ist bereits im Key enthalten, daher wird Multigraph-Präzision gewährleistet
            continue
        # WP-24c v4.2.1: Callout ist NICHT in Chunks -> lege mit scope: "note" an
@@ -399,59 +406,90 @@ def build_edges_for_note(
        )
        edges.extend(callout_edges_from_markdown)
-    # 6) De-Duplizierung (In-Place) mit Priorisierung
+    # 6) WP-24c v4.2.2: Semantische De-Duplizierung mit Scope-Entscheidung
-    # WP-24c v4.2.1: Smart Scope-Priorisierung
+    # Problem: edge_id enthält Scope, daher werden semantisch identische Kanten
-    # - chunk-Scope wird bevorzugt (präzisere Information für RAG)
+    # (gleiches kind, source, target, section) mit unterschiedlichem Scope nicht erkannt.
-    # - note-Scope gewinnt nur bei höherer Provenance-Priorität (z.B. explicit:note_zone)
+    # Lösung: Zuerst semantische Gruppierung, dann Scope-Entscheidung, dann ID-Berechnung.
-    # WP-24c v4.1.0: Da die EDGE-ID nun auf 5 Parametern basiert (inkl. target_section),
+    
-    # bleiben Links auf unterschiedliche Abschnitte derselben Note als eigenständige 
+    # Schritt 1: Semantische Gruppierung (unabhängig vom Scope)
-    # Kanten erhalten. Nur identische Sektions-Links werden nach Confidence und Provenance konsolidiert.
+    # Schlüssel: (kind, source_id, target_id, target_section)
-    unique_map: Dict[str, dict] = {}
+    # Hinweis: source_id ist bei chunk-Scope die chunk_id, bei note-Scope die note_id
    # Für semantische Gleichheit müssen wir prüfen: Ist die Quelle die gleiche Note?
    semantic_groups: Dict[Tuple[str, str, str, Optional[str]], List[dict]] = {}
    for e in edges:
-        eid = e["edge_id"]
+        kind = e.get("kind", "related_to")
        source_id = e.get("source_id", "")
        target_id = e.get("target_id", "")
        target_section = e.get("target_section")
        scope = e.get("scope", "chunk")
        note_id_from_edge = e.get("note_id", "")
-        if eid not in unique_map:
+        # WP-24c v4.2.2: Normalisiere source_id für semantische Gruppierung
-            unique_map[eid] = e
+        # Bei chunk-Scope: source_id ist chunk_id, aber wir wollen nach note_id gruppieren
        # Bei note-Scope: source_id ist bereits note_id
        # Für semantische Gleichheit: Beide Kanten müssen von derselben Note ausgehen
        if scope == "chunk":
            # Bei chunk-Scope: source_id ist chunk_id, aber note_id ist im Edge vorhanden
            # Wir verwenden note_id als semantische Quelle
            semantic_source = note_id_from_edge
        else:
-            existing = unique_map[eid]
+            # Bei note-Scope: source_id ist bereits note_id
-            existing_scope = existing.get("scope", "chunk")
+            semantic_source = source_id
            new_scope = e.get("scope", "chunk")
            existing_prov = existing.get("provenance", "")
            new_prov = e.get("provenance", "")
-            # WP-24c v4.2.1: Scope-Priorisierung
+        # Semantischer Schlüssel: (kind, semantic_source, target_id, target_section)
-            # 1. explicit:note_zone hat höchste Priorität (unabhängig von Scope)
+        semantic_key = (kind, semantic_source, target_id, target_section)
            is_existing_note_zone = existing_prov == "explicit:note_zone"
            is_new_note_zone = new_prov == "explicit:note_zone"
-            if is_new_note_zone and not is_existing_note_zone:
+        if semantic_key not in semantic_groups:
-                # Neuer Link ist Note-Scope Zone -> ersetze
+            semantic_groups[semantic_key] = []
-                unique_map[eid] = e
+        semantic_groups[semantic_key].append(e)
-            elif is_existing_note_zone and not is_new_note_zone:
+    
-                # Bestehender Link ist Note-Scope Zone -> behalte
+    # Schritt 2: Scope-Entscheidung pro semantischer Gruppe
-                pass
+    # Schritt 3: ID-Zuweisung nach Scope-Entscheidung
    final_edges: List[dict] = []
    for semantic_key, group in semantic_groups.items():
        if len(group) == 1:
            # Nur eine Kante: Direkt verwenden, aber ID neu berechnen mit finalem Scope
            winner = group[0]
            final_scope = winner.get("scope", "chunk")
            final_source = winner.get("source_id", "")
            kind, semantic_source, target_id, target_section = semantic_key
            # WP-24c v4.2.2: Berechne edge_id mit finalem Scope
            final_edge_id = _mk_edge_id(kind, final_source, target_id, final_scope, target_section=target_section)
            winner["edge_id"] = final_edge_id
            final_edges.append(winner)
        else:
-                # 2. chunk-Scope bevorzugen (präzisere Information)
+            # Mehrere Kanten mit gleichem semantischen Schlüssel: Scope-Entscheidung
-                if existing_scope == "chunk" and new_scope == "note":
+            winner = None
-                    # Bestehender chunk-Scope -> behalte
+            
-                    pass
+            # Regel 1: explicit:note_zone hat höchste Priorität
-                elif existing_scope == "note" and new_scope == "chunk":
+            note_zone_candidates = [e for e in group if e.get("provenance") == "explicit:note_zone"]
-                    # Neuer chunk-Scope -> ersetze (präziser)
+            if note_zone_candidates:
-                    unique_map[eid] = e
+                # Wenn mehrere note_zone: Nimm die mit höchster Confidence
                winner = max(note_zone_candidates, key=lambda e: e.get("confidence", 0))
            else:
-                    # Gleicher Scope -> vergleiche Confidence und Provenance-Priority
+                # Regel 2: chunk-Scope bevorzugen (Präzisions-Vorteil)
-                    existing_conf = existing.get("confidence", 0)
+                chunk_candidates = [e for e in group if e.get("scope") == "chunk"]
-                    new_conf = e.get("confidence", 0)
+                if chunk_candidates:
                    # Wenn mehrere chunk: Nimm die mit höchster Confidence * Priority
                    winner = max(chunk_candidates, key=lambda e: (
                        e.get("confidence", 0) * PROVENANCE_PRIORITY.get(e.get("provenance", ""), 0.7)
                    ))
                else:
                    # Regel 3: Fallback: Höchste Confidence * Priority
                    winner = max(group, key=lambda e: (
                        e.get("confidence", 0) * PROVENANCE_PRIORITY.get(e.get("provenance", ""), 0.7)
                    ))
-                    # Provenance-Priority berücksichtigen
+            # WP-24c v4.2.2: Berechne edge_id mit finalem Scope
-                    existing_priority = PROVENANCE_PRIORITY.get(existing_prov, 0.7)
+            final_scope = winner.get("scope", "chunk")
-                    new_priority = PROVENANCE_PRIORITY.get(new_prov, 0.7)
+            final_source = winner.get("source_id", "")
            kind, semantic_source, target_id, target_section = semantic_key
-                    # Kombinierter Score: Confidence * Priority
+            final_edge_id = _mk_edge_id(kind, final_source, target_id, final_scope, target_section=target_section)
-                    existing_score = existing_conf * existing_priority
+            winner["edge_id"] = final_edge_id
-                    new_score = new_conf * new_priority
+            final_edges.append(winner)
-                    if new_score > existing_score:
+    return final_edges
                        unique_map[eid] = e
    return list(unique_map.values())