2026-01-12 10:53:20 +01:00
1 changed file with 37 additions and 12 deletions
--- a/app/core/graph/graph_derive_edges.py
+++ b/app/core/graph/graph_derive_edges.py
@@ -210,6 +210,8 @@ def build_edges_for_note(
    Erzeugt und aggregiert alle Kanten für eine Note.
    WP-24c v4.2.0: Unterstützt Note-Scope Extraktions-Zonen.
    WP-24c v4.2.7: Chunk-Attribution für Callouts über candidate_pool mit explicit:callout Provenance.
+    WP-24c v4.2.9: Finalisierung der Chunk-Attribution - Synchronisation mit "Semantic First" Signal.
+                   Callout-Keys werden VOR dem globalen Scan aus candidate_pool gesammelt.
    
    Args:
        note_id: ID der Note
@@ -290,9 +292,28 @@ def build_edges_for_note(
    defaults = get_edge_defaults_for(note_type, reg)
    refs_all: List[str] = []
    
-    # WP-24c v4.2.1: Sammle alle Callout-Keys aus Chunks für Smart Logic
+    # WP-24c v4.2.9: Sammle alle Callout-Keys aus Chunks für Smart Logic
+    # WICHTIG: Diese Menge muss VOR dem globalen Scan vollständig sein
    all_chunk_callout_keys: Set[Tuple[str, str, Optional[str]]] = set()

+    # WP-24c v4.2.9: PHASE 1: Sammle alle Callout-Keys aus candidate_pool VOR Text-Scan
+    # Dies stellt sicher, dass bereits geerntete Callouts nicht dupliziert werden
+    for ch in chunks:
+        cid = _get(ch, "chunk_id", "id")
+        if not cid: continue
+        
+        # Candidate Pool (WP-15b Validierte KI-Kanten) - Phase 1 liest nur, Kanten entstehen erst in Phase 2
+        # WP-24c v4.2.9: Sammle Callout-Keys VOR Text-Scan für Synchronisation
+        pool = ch.get("candidate_pool") or ch.get("candidate_edges") or []
+        for cand in pool:
+            raw_t, k, p = cand.get("to"), cand.get("kind", "related_to"), cand.get("provenance", "semantic_ai")
+            t, sec = parse_link_target(raw_t, note_id)
+            if t and p == "explicit:callout":
+                # WP-24c v4.2.9: Markiere als bereits auf Chunk-Ebene verarbeitet
+                # Dies verhindert, dass der globale Scan diese Kante als Note-Scope neu anlegt
+                all_chunk_callout_keys.add((k, t, sec))
+
+    # WP-24c v4.2.9: PHASE 2: Verarbeite Chunks und erstelle Kanten
    for ch in chunks:
        cid = _get(ch, "chunk_id", "id")
        if not cid: continue
@@ -314,17 +335,12 @@ def build_edges_for_note(
            edges.append(_edge(k, "chunk", cid, t, note_id, payload))

        # B. Candidate Pool (WP-15b Validierte KI-Kanten)
-        # WP-24c v4.2.7: Sammle Callout-Keys für Chunk-Attribution
+        # WP-24c v4.2.9: Erstelle Kanten aus candidate_pool (Keys bereits in Phase 1 gesammelt)
        pool = ch.get("candidate_pool") or ch.get("candidate_edges") or []
        for cand in pool:
            raw_t, k, p = cand.get("to"), cand.get("kind", "related_to"), cand.get("provenance", "semantic_ai")
            t, sec = parse_link_target(raw_t, note_id)
            if t:
-                # WP-24c v4.2.7: Wenn Provenance explicit:callout, füge zu all_chunk_callout_keys hinzu
-                # Dadurch weiß die globale Extraktion, dass diese Kante bereits auf Chunk-Ebene versorgt ist
-                if p == "explicit:callout":
-                    all_chunk_callout_keys.add((k, t, sec))
-                
                # WP-24c v4.1.0: target_section fließt nun fest in die ID-Generierung ein
                payload = {
                    "chunk_id": cid, 
@@ -334,14 +350,22 @@ def build_edges_for_note(
                if sec: payload["target_section"] = sec
                edges.append(_edge(k, "chunk", cid, t, note_id, payload))

-        # C. Callouts (> [!edge]) - WP-24c v4.2.1: Sammle für Smart Logic
+        # C. Callouts (> [!edge]) - WP-24c v4.2.9: Fallback für Callouts im gereinigten Text
+        # HINWEIS: Da der Text bereits gereinigt wurde (Clean-Context), werden hier typischerweise
+        # keine Callouts mehr gefunden. Falls doch, prüfe gegen all_chunk_callout_keys.
        call_pairs, rem2 = extract_callout_relations(rem)
        for k, raw_t in call_pairs:
            t, sec = parse_link_target(raw_t, note_id)
            if not t: continue
            
+            # WP-24c v4.2.9: Prüfe, ob dieser Callout bereits erfasst wurde (Phase 1 oder früherer Text-Callout)
+            callout_key = (k, t, sec)
+            if callout_key in all_chunk_callout_keys:
+                # Bereits erfasst (candidate_pool oder früherer Text-Callout) -> überspringe (wird mit chunk-Scope angelegt)
+                continue
+            
            # WP-24c v4.2.1: Tracke Callout für spätere Deduplizierung (global sammeln)
-            all_chunk_callout_keys.add((k, t, sec))
+            all_chunk_callout_keys.add(callout_key)
            
            # WP-24c v4.1.0: target_section fließt nun fest in die ID-Generierung ein
            payload = {
@@ -402,14 +426,15 @@ def build_edges_for_note(
    # 4) WP-24c v4.2.0: Note-Scope Edges hinzufügen (VOR De-Duplizierung)
    edges.extend(note_scope_edges)
    
-    # 5) WP-24c v4.2.1: Callout-Extraktion aus Markdown (NACH Chunk-Verarbeitung)
-    # Smart Logic: Nur Callouts, die NICHT in Chunks vorkommen, werden mit scope: "note" angelegt
+    # 5) WP-24c v4.2.9: Callout-Extraktion aus Markdown (NACH Chunk-Verarbeitung)
+    # Deduplizierungs-Garantie: Nur Callouts, die NICHT in all_chunk_callout_keys sind,
+    # werden mit scope: "note" angelegt. Dies verhindert Duplikate für bereits geerntete Callouts.
    callout_edges_from_markdown: List[dict] = []
    if markdown_body:
        callout_edges_from_markdown = extract_callouts_from_markdown(
            markdown_body, 
            note_id,
-            existing_chunk_callouts=all_chunk_callout_keys
+            existing_chunk_callouts=all_chunk_callout_keys  # WP-24c v4.2.9: Strikte Respektierung
        )
        edges.extend(callout_edges_from_markdown)