Refine edge parsing and chunk attribution in chunking_parser.py and graph_derive_edges.py for version 4.2.9: Ensure current_edge_type persists across empty lines in callout blocks for accurate link processing. Implement two-phase synchronization for chunk authority, collecting explicit callout keys before the global scan to prevent duplicates. Enhance callout extraction logic to respect existing chunk callouts, improving deduplication and processing efficiency.

2026-01-11 14:30:16 +01:00 · 2026-01-11 14:30:16 +01:00 · 727de50290
commit 727de50290
parent a780104b3c
2 changed files with 31 additions and 15 deletions
--- a/app/core/chunking/chunking_parser.py
+++ b/app/core/chunking/chunking_parser.py
@@ -206,6 +206,8 @@ def parse_edges_robust(text: str) -> List[Dict[str, Any]]:
    """
    Extrahiert Kanten-Kandidaten aus Wikilinks und Callouts.
    WP-24c v4.2.7: Gibt Liste von Dicts zurück mit is_callout Flag für Chunk-Attribution.
    WP-24c v4.2.9 Fix A: current_edge_type bleibt über Leerzeilen hinweg erhalten,
    damit alle Links in einem Callout-Block korrekt verarbeitet werden.
    Returns:
        List[Dict] mit keys: "edge" (str: "kind:target"), "is_callout" (bool)
@@ -234,11 +236,16 @@ def parse_edges_robust(text: str) -> List[Dict[str, Any]]:
                    found_edges.append({"edge": f"{current_edge_type}:{l}", "is_callout": True})
            continue
        # Links in Folgezeilen des Callouts
        # WP-24c v4.2.9 Fix A: current_edge_type bleibt über Leerzeilen hinweg erhalten
        # innerhalb eines Callout-Blocks, damit alle Links korrekt verarbeitet werden
        if current_edge_type and stripped.startswith('>'):
            # Fortsetzung des Callout-Blocks: Links extrahieren
            links = re.findall(r'\[\[([^\]]+)\]\]', stripped)
            for l in links: 
                if "rel:" not in l:
                    found_edges.append({"edge": f"{current_edge_type}:{l}", "is_callout": True})
-        elif not stripped.startswith('>'): 
+        elif current_edge_type and not stripped.startswith('>') and stripped:
            # Nicht-Callout-Zeile mit Inhalt: Callout-Block beendet
            current_edge_type = None
        # Leerzeilen werden ignoriert - current_edge_type bleibt erhalten
    return found_edges
--- a/app/core/graph/graph_derive_edges.py
+++ b/app/core/graph/graph_derive_edges.py
@@ -212,6 +212,9 @@ def build_edges_for_note(
    WP-24c v4.2.7: Chunk-Attribution für Callouts über candidate_pool mit explicit:callout Provenance.
    WP-24c v4.2.9: Finalisierung der Chunk-Attribution - Synchronisation mit "Semantic First" Signal.
                   Callout-Keys werden VOR dem globalen Scan aus candidate_pool gesammelt.
    WP-24c v4.2.9 Fix B: Zwei-Phasen-Synchronisation für Chunk-Autorität.
                        Phase 1: Sammle alle explicit:callout Keys VOR Text-Scan.
                        Phase 2: Globaler Scan respektiert all_chunk_callout_keys als Ausschlusskriterium.
    Args:
        note_id: ID der Note
@@ -292,26 +295,31 @@ def build_edges_for_note(
    defaults = get_edge_defaults_for(note_type, reg)
    refs_all: List[str] = []
-    # WP-24c v4.2.9: Sammle alle Callout-Keys aus Chunks für Smart Logic
+    # WP-24c v4.2.9 Fix B: Zwei-Phasen-Synchronisation für Chunk-Autorität
    # WICHTIG: Diese Menge muss VOR dem globalen Scan vollständig sein
    all_chunk_callout_keys: Set[Tuple[str, str, Optional[str]]] = set()
-    # WP-24c v4.2.9: PHASE 1: Sammle alle Callout-Keys aus candidate_pool VOR Text-Scan
+    # PHASE 1 (Sicherung der Chunk-Autorität): Sammle alle Callout-Keys aus candidate_pool
    # BEVOR der globale Markdown-Scan oder der Loop über die Chunks beginnt
    # Dies stellt sicher, dass bereits geerntete Callouts nicht dupliziert werden
    for ch in chunks:
        cid = _get(ch, "chunk_id", "id")
        if not cid: continue
-        # B. Candidate Pool (WP-15b Validierte KI-Kanten)
+        # Iteriere durch candidate_pool und sammle explicit:callout Kanten
        # WP-24c v4.2.9: Sammle Callout-Keys VOR Text-Scan für Synchronisation
        pool = ch.get("candidate_pool") or ch.get("candidate_edges") or []
        for cand in pool:
-            raw_t, k, p = cand.get("to"), cand.get("kind", "related_to"), cand.get("provenance", "semantic_ai")
+            raw_t = cand.get("to")
-            t, sec = parse_link_target(raw_t, note_id)
+            k = cand.get("kind", "related_to")
-            if t and p == "explicit:callout":
+            p = cand.get("provenance", "semantic_ai")
-                # WP-24c v4.2.9: Markiere als bereits auf Chunk-Ebene verarbeitet
+            
-                # Dies verhindert, dass der globale Scan diese Kante als Note-Scope neu anlegt
+            # WP-24c v4.2.9 Fix B: Wenn Provenance explicit:callout, extrahiere Key
-                all_chunk_callout_keys.add((k, t, sec))
+            if p == "explicit:callout":
                t, sec = parse_link_target(raw_t, note_id)
                if t:
                    # Key-Format: (kind, target, section) für Multigraph-Präzision
                    # Dies verhindert, dass der globale Scan diese Kante als Note-Scope neu anlegt
                    all_chunk_callout_keys.add((k, t, sec))
    # WP-24c v4.2.9: PHASE 2: Verarbeite Chunks und erstelle Kanten
    for ch in chunks:
@@ -426,15 +434,16 @@ def build_edges_for_note(
    # 4) WP-24c v4.2.0: Note-Scope Edges hinzufügen (VOR De-Duplizierung)
    edges.extend(note_scope_edges)
-    # 5) WP-24c v4.2.9: Callout-Extraktion aus Markdown (NACH Chunk-Verarbeitung)
+    # 5) WP-24c v4.2.9 Fix B PHASE 2 (Deduplizierung): Callout-Extraktion aus Markdown
-    # Deduplizierungs-Garantie: Nur Callouts, die NICHT in all_chunk_callout_keys sind,
+    # Der globale Scan des markdown_body nutzt all_chunk_callout_keys als Ausschlusskriterium.
-    # werden mit scope: "note" angelegt. Dies verhindert Duplikate für bereits geerntete Callouts.
+    # Callouts, die bereits in Phase 1 als Chunk-Kanten identifiziert wurden,
    # dürfen nicht erneut als Note-Scope Kanten angelegt werden.
    callout_edges_from_markdown: List[dict] = []
    if markdown_body:
        callout_edges_from_markdown = extract_callouts_from_markdown(
            markdown_body, 
            note_id,
-            existing_chunk_callouts=all_chunk_callout_keys  # WP-24c v4.2.9: Strikte Respektierung
+            existing_chunk_callouts=all_chunk_callout_keys  # WP-24c v4.2.9 Fix B: Strikte Respektierung
        )
        edges.extend(callout_edges_from_markdown)