2026-01-12 10:53:20 +01:00
2 changed files with 31 additions and 15 deletions
--- a/app/core/chunking/chunking_parser.py
+++ b/app/core/chunking/chunking_parser.py
@@ -206,6 +206,8 @@ def parse_edges_robust(text: str) -> List[Dict[str, Any]]:
    """
    Extrahiert Kanten-Kandidaten aus Wikilinks und Callouts.
    WP-24c v4.2.7: Gibt Liste von Dicts zurück mit is_callout Flag für Chunk-Attribution.
+    WP-24c v4.2.9 Fix A: current_edge_type bleibt über Leerzeilen hinweg erhalten,
+    damit alle Links in einem Callout-Block korrekt verarbeitet werden.
    
    Returns:
        List[Dict] mit keys: "edge" (str: "kind:target"), "is_callout" (bool)
@@ -234,11 +236,16 @@ def parse_edges_robust(text: str) -> List[Dict[str, Any]]:
                    found_edges.append({"edge": f"{current_edge_type}:{l}", "is_callout": True})
            continue
        # Links in Folgezeilen des Callouts
+        # WP-24c v4.2.9 Fix A: current_edge_type bleibt über Leerzeilen hinweg erhalten
+        # innerhalb eines Callout-Blocks, damit alle Links korrekt verarbeitet werden
        if current_edge_type and stripped.startswith('>'):
+            # Fortsetzung des Callout-Blocks: Links extrahieren
            links = re.findall(r'\[\[([^\]]+)\]\]', stripped)
            for l in links: 
                if "rel:" not in l:
                    found_edges.append({"edge": f"{current_edge_type}:{l}", "is_callout": True})
-        elif not stripped.startswith('>'): 
+        elif current_edge_type and not stripped.startswith('>') and stripped:
+            # Nicht-Callout-Zeile mit Inhalt: Callout-Block beendet
            current_edge_type = None
+        # Leerzeilen werden ignoriert - current_edge_type bleibt erhalten
    return found_edges
--- a/app/core/graph/graph_derive_edges.py
+++ b/app/core/graph/graph_derive_edges.py
@@ -212,6 +212,9 @@ def build_edges_for_note(
    WP-24c v4.2.7: Chunk-Attribution für Callouts über candidate_pool mit explicit:callout Provenance.
    WP-24c v4.2.9: Finalisierung der Chunk-Attribution - Synchronisation mit "Semantic First" Signal.
                   Callout-Keys werden VOR dem globalen Scan aus candidate_pool gesammelt.
+    WP-24c v4.2.9 Fix B: Zwei-Phasen-Synchronisation für Chunk-Autorität.
+                        Phase 1: Sammle alle explicit:callout Keys VOR Text-Scan.
+                        Phase 2: Globaler Scan respektiert all_chunk_callout_keys als Ausschlusskriterium.
    
    Args:
        note_id: ID der Note
@@ -292,26 +295,31 @@ def build_edges_for_note(
    defaults = get_edge_defaults_for(note_type, reg)
    refs_all: List[str] = []
    
-    # WP-24c v4.2.9: Sammle alle Callout-Keys aus Chunks für Smart Logic
+    # WP-24c v4.2.9 Fix B: Zwei-Phasen-Synchronisation für Chunk-Autorität
    # WICHTIG: Diese Menge muss VOR dem globalen Scan vollständig sein
    all_chunk_callout_keys: Set[Tuple[str, str, Optional[str]]] = set()

-    # WP-24c v4.2.9: PHASE 1: Sammle alle Callout-Keys aus candidate_pool VOR Text-Scan
+    # PHASE 1 (Sicherung der Chunk-Autorität): Sammle alle Callout-Keys aus candidate_pool
+    # BEVOR der globale Markdown-Scan oder der Loop über die Chunks beginnt
    # Dies stellt sicher, dass bereits geerntete Callouts nicht dupliziert werden
    for ch in chunks:
        cid = _get(ch, "chunk_id", "id")
        if not cid: continue
        
-        # B. Candidate Pool (WP-15b Validierte KI-Kanten)
-        # WP-24c v4.2.9: Sammle Callout-Keys VOR Text-Scan für Synchronisation
+        # Iteriere durch candidate_pool und sammle explicit:callout Kanten
        pool = ch.get("candidate_pool") or ch.get("candidate_edges") or []
        for cand in pool:
-            raw_t, k, p = cand.get("to"), cand.get("kind", "related_to"), cand.get("provenance", "semantic_ai")
-            t, sec = parse_link_target(raw_t, note_id)
-            if t and p == "explicit:callout":
-                # WP-24c v4.2.9: Markiere als bereits auf Chunk-Ebene verarbeitet
-                # Dies verhindert, dass der globale Scan diese Kante als Note-Scope neu anlegt
-                all_chunk_callout_keys.add((k, t, sec))
+            raw_t = cand.get("to")
+            k = cand.get("kind", "related_to")
+            p = cand.get("provenance", "semantic_ai")
+            
+            # WP-24c v4.2.9 Fix B: Wenn Provenance explicit:callout, extrahiere Key
+            if p == "explicit:callout":
+                t, sec = parse_link_target(raw_t, note_id)
+                if t:
+                    # Key-Format: (kind, target, section) für Multigraph-Präzision
+                    # Dies verhindert, dass der globale Scan diese Kante als Note-Scope neu anlegt
+                    all_chunk_callout_keys.add((k, t, sec))

    # WP-24c v4.2.9: PHASE 2: Verarbeite Chunks und erstelle Kanten
    for ch in chunks:
@@ -426,15 +434,16 @@ def build_edges_for_note(
    # 4) WP-24c v4.2.0: Note-Scope Edges hinzufügen (VOR De-Duplizierung)
    edges.extend(note_scope_edges)
    
-    # 5) WP-24c v4.2.9: Callout-Extraktion aus Markdown (NACH Chunk-Verarbeitung)
-    # Deduplizierungs-Garantie: Nur Callouts, die NICHT in all_chunk_callout_keys sind,
-    # werden mit scope: "note" angelegt. Dies verhindert Duplikate für bereits geerntete Callouts.
+    # 5) WP-24c v4.2.9 Fix B PHASE 2 (Deduplizierung): Callout-Extraktion aus Markdown
+    # Der globale Scan des markdown_body nutzt all_chunk_callout_keys als Ausschlusskriterium.
+    # Callouts, die bereits in Phase 1 als Chunk-Kanten identifiziert wurden,
+    # dürfen nicht erneut als Note-Scope Kanten angelegt werden.
    callout_edges_from_markdown: List[dict] = []
    if markdown_body:
        callout_edges_from_markdown = extract_callouts_from_markdown(
            markdown_body, 
            note_id,
-            existing_chunk_callouts=all_chunk_callout_keys  # WP-24c v4.2.9: Strikte Respektierung
+            existing_chunk_callouts=all_chunk_callout_keys  # WP-24c v4.2.9 Fix B: Strikte Respektierung
        )
        edges.extend(callout_edges_from_markdown)