From 38a61d7b509bbffc678108ab33048e75207faf03 Mon Sep 17 00:00:00 2001
From: Lars <Lars@stommer.de>
Date: Mon, 29 Dec 2025 12:21:57 +0100
Subject: [PATCH] Fix: Semantische Deduplizierung in graph_derive_edges.py

---
 app/core/graph/graph_derive_edges.py | 48 ++++++++++++++++++----------
 1 file changed, 31 insertions(+), 17 deletions(-)

diff --git a/app/core/graph/graph_derive_edges.py b/app/core/graph/graph_derive_edges.py
index 1f880ff..2d20530 100644
--- a/app/core/graph/graph_derive_edges.py
+++ b/app/core/graph/graph_derive_edges.py
@@ -4,7 +4,7 @@ DESCRIPTION: Hauptlogik zur Kanten-Aggregation und De-Duplizierung.
              AUDIT: 
              - Nutzt parse_link_target
              - Übergibt Section als 'variant' an ID-Gen
-             - Dedup basiert jetzt auf Edge-ID (erlaubt Multigraph für Sections)
+             - FIXED: Semantische De-Duplizierung (ignoriert rule_id bei Konflikten)
 """
 from typing import List, Optional, Dict, Tuple
 from .graph_utils import (
@@ -21,11 +21,11 @@ def build_edges_for_note(
     note_level_references: Optional[List[str]] = None,
     include_note_scope_refs: bool = False,
 ) -> List[dict]:
-    """Erzeugt und aggregiert alle Kanten für eine Note."""
+    """Erzeugt und aggregiert alle Kanten für eine Note (WP-15b)."""
     edges: List[dict] = []
     note_type = _get(chunks[0], "type") if chunks else "concept"
 
-    # 1) Struktur-Kanten
+    # 1) Struktur-Kanten (belongs_to, next/prev)
     for idx, ch in enumerate(chunks):
         cid = _get(ch, "chunk_id", "id")
         if not cid: continue
@@ -55,21 +55,21 @@ def build_edges_for_note(
         if not cid: continue
         raw = _get(ch, "window") or _get(ch, "text") or ""
 
-        # Typed
+        # Typed & Candidate Pool (WP-15b Integration)
         typed, rem = extract_typed_relations(raw)
         for k, raw_t in typed:
             t, sec = parse_link_target(raw_t, note_id)
             if not t: continue
+            
             payload = {
                 "chunk_id": cid, 
-                # Variant=sec sorgt für eindeutige ID pro Abschnitt
                 "edge_id": _mk_edge_id(k, cid, t, "chunk", "inline:rel", variant=sec),
                 "provenance": "explicit", "rule_id": "inline:rel", "confidence": PROVENANCE_PRIORITY["inline:rel"]
             }
             if sec: payload["target_section"] = sec
+            
             edges.append(_edge(k, "chunk", cid, t, note_id, payload))
 
-        # Semantic AI Candidates
         pool = ch.get("candidate_pool") or ch.get("candidate_edges") or []
         for cand in pool:
             raw_t, k, p = cand.get("to"), cand.get("kind", "related_to"), cand.get("provenance", "semantic_ai")
@@ -81,38 +81,38 @@ def build_edges_for_note(
                     "provenance": p, "rule_id": f"candidate:{p}", "confidence": PROVENANCE_PRIORITY.get(p, 0.90)
                 }
                 if sec: payload["target_section"] = sec
+                
                 edges.append(_edge(k, "chunk", cid, t, note_id, payload))
 
-        # Callouts
+        # Callouts & Wikilinks
         call_pairs, rem2 = extract_callout_relations(rem)
         for k, raw_t in call_pairs:
             t, sec = parse_link_target(raw_t, note_id)
             if not t: continue
+            
             payload = {
                 "chunk_id": cid, 
                 "edge_id": _mk_edge_id(k, cid, t, "chunk", "callout:edge", variant=sec),
                 "provenance": "explicit", "rule_id": "callout:edge", "confidence": PROVENANCE_PRIORITY["callout:edge"]
             }
             if sec: payload["target_section"] = sec
+            
             edges.append(_edge(k, "chunk", cid, t, note_id, payload))
 
-        # Wikilinks & Defaults
         refs = extract_wikilinks(rem2)
         for raw_r in refs:
             r, sec = parse_link_target(raw_r, note_id)
             if not r: continue
             
-            # Explicit Reference
             payload = {
                 "chunk_id": cid, "ref_text": raw_r, 
                 "edge_id": _mk_edge_id("references", cid, r, "chunk", "explicit:wikilink", variant=sec),
                 "provenance": "explicit", "rule_id": "explicit:wikilink", "confidence": PROVENANCE_PRIORITY["explicit:wikilink"]
             }
             if sec: payload["target_section"] = sec
+            
             edges.append(_edge("references", "chunk", cid, r, note_id, payload))
             
-            # Defaults (nur einmal pro Target, Section hier irrelevant für Typ-Logik, oder?)
-            # Wir erzeugen Defaults auch pro Section, um Konsistenz zu wahren.
             for rel in defaults:
                 if rel != "references":
                     def_payload = {
@@ -141,13 +141,27 @@ def build_edges_for_note(
                 "provenance": "rule", "confidence": PROVENANCE_PRIORITY["derived:backlink"]
             }))
 
-    # Deduplizierung: Wir nutzen jetzt die EDGE-ID als Schlüssel.
-    # Da die Edge-ID nun 'variant' (Section) enthält, bleiben unterschiedliche Sections erhalten.
+    # FIX: Semantische Deduplizierung
+    # Wir nutzen einen Key aus (Source, Target, Kind, Section), um Duplikate 
+    # aus verschiedenen Regeln (z.B. callout vs. wikilink) zusammenzuführen.
     unique_map: Dict[str, dict] = {}
+    
     for e in edges:
-        eid = e["edge_id"]
-        # Bei Konflikt (gleiche ID = exakt gleiche Kante und Section) gewinnt die höhere Confidence
-        if eid not in unique_map or e.get("confidence", 0) > unique_map[eid].get("confidence", 0):
-            unique_map[eid] = e
+        # Semantischer Schlüssel: Unabhängig von rule_id oder edge_id
+        src = e.get("source_id", "")
+        tgt = e.get("target_id", "")
+        kind = e.get("kind", "")
+        sec = e.get("target_section", "")
+        
+        sem_key = f"{src}->{tgt}:{kind}@{sec}"
+        
+        if sem_key not in unique_map:
+            unique_map[sem_key] = e
+        else:
+            # Conflict resolution: the edge with the strictly higher confidence wins; on a tie the first-seen edge is kept
+            curr_conf = unique_map[sem_key].get("confidence", 0.0)
+            new_conf = e.get("confidence", 0.0)
+            if new_conf > curr_conf:
+                unique_map[sem_key] = e
                 
     return list(unique_map.values())
\ No newline at end of file