2026-01-12 10:53:20 +01:00
1 changed files with 202 additions and 41 deletions
--- a/app/core/graph/graph_derive_edges.py
+++ b/app/core/graph/graph_derive_edges.py
@ -208,8 +208,9 @@ def extract_llm_validation_zones(markdown_body: str) -> List[Tuple[str, str]]:
    llm_validation_headers = get_llm_validation_zone_headers()
    
    for i, line in enumerate(lines):
-        # Prüfe auf Header
+        # Prüfe auf Header (konfiguriertes Level aus MINDNET_LLM_VALIDATION_HEADER_LEVEL)
        header_match = re.match(header_pattern, line.strip())
+        
        if header_match:
            header_text = header_match.group(1).strip()
            
@ -266,11 +267,16 @@ def extract_callouts_from_markdown(
 ) -> List[dict]:
    """
    WP-24c v4.2.1: Extrahiert Callouts aus dem Original-Markdown.
+    WP-24c v4.5.6: Header-Status-Maschine für korrekte Zonen-Erkennung.
    
    Smart Logic: Nur Callouts, die NICHT in Chunks vorkommen (z.B. in Edge-Zonen),
    werden mit scope: "note" angelegt. Callouts, die bereits in Chunks erfasst wurden,
    werden übersprungen, um Duplikate zu vermeiden.
    
+    WP-24c v4.5.6: Prüft für jeden Callout, ob er in einer LLM-Validierungs-Zone liegt.
+    - In LLM-Validierungs-Zone: rule_id = "candidate:explicit:callout"
+    - In Standard-Zone: rule_id = "explicit:callout" (ohne candidate:)
+    
    Args:
        markdown_body: Original-Markdown-Text (vor Chunking-Filterung)
        note_id: ID der Note
@ -287,52 +293,207 @@ def extract_callouts_from_markdown(
    
    edges: List[dict] = []
    
-    # Extrahiere alle Callouts aus dem gesamten Markdown
-    call_pairs, _ = extract_callout_relations(markdown_body)
+    # WP-24c v4.5.6: Header-Status-Maschine - Baue Mapping von Zeilen zu Zonen-Status
+    import os
+    import re
    
-    for k, raw_t in call_pairs:
-        t, sec = parse_link_target(raw_t, note_id)
-        if not t:
+    llm_validation_headers = get_llm_validation_zone_headers()
+    llm_validation_level = int(os.getenv("MINDNET_LLM_VALIDATION_HEADER_LEVEL", "3"))
+    # WP-24c v4.5.6: Konfigurierbare Header-Ebene (vollständig über .env steuerbar)
+    header_level_pattern = "#" * llm_validation_level
+    header_pattern = rf'^{re.escape(header_level_pattern)}\s+(.+?)$'
+    
+    lines = markdown_body.split('\n')
+    current_zone_is_llm_validation = False
+    
+    # WP-24c v4.5.6: Zeile-für-Zeile Verarbeitung mit Zonen-Tracking
+    # Extrahiere Callouts direkt während des Durchlaufs, um Zonen-Kontext zu behalten
+    current_kind = None
+    in_callout_block = False
+    callout_block_lines = []  # Sammle Zeilen eines Callout-Blocks
+    
+    for i, line in enumerate(lines):
+        stripped = line.strip()
+        
+        # WP-24c v4.5.6: Prüfe auf Header (Zonen-Wechsel)
+        # Verwendet das konfigurierte Level aus MINDNET_LLM_VALIDATION_HEADER_LEVEL
+        header_match = re.match(header_pattern, stripped)
+        
+        if header_match:
+            header_text = header_match.group(1).strip()
+            # Prüfe, ob dieser Header eine LLM-Validierungs-Zone startet
+            # WP-24c v4.5.6: Header-Status-Maschine - korrekte Zonen-Erkennung
+            current_zone_is_llm_validation = any(
+                header_text.lower() == llm_header.lower()
+                for llm_header in llm_validation_headers
+            )
+            logger.debug(f"DEBUG-TRACER [Zone-Change]: Header '{header_text}' (Level {llm_validation_level}) -> LLM-Validierung: {current_zone_is_llm_validation}")
+            # Beende aktuellen Callout-Block bei Header-Wechsel
+            if in_callout_block:
+                # Verarbeite gesammelten Callout-Block VOR dem Zonen-Wechsel
+                if callout_block_lines:
+                    block_text = '\n'.join([lines[j] for j in callout_block_lines])
+                    block_call_pairs, _ = extract_callout_relations(block_text)
+                    
+                    # Verarbeite jeden Callout mit Zonen-Kontext
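+                    # NOTE(review): intent is to use the zone status from BEFORE this header, but current_zone_is_llm_validation was already reassigned from the new header above, so the value captured below is the NEW zone's status - capture the old status before reassigning.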
+                    zone_before_header = current_zone_is_llm_validation
+                    
+                    for k, raw_t in block_call_pairs:
+                        t, sec = parse_link_target(raw_t, note_id)
+                        if not t:
+                            continue
+                        
+                        callout_key = (k, t, sec)
+                        is_blocked = callout_key in existing_chunk_callouts
+                        
+                        if is_blocked:
+                            continue
+                        
+                        # WP-24c v4.5.6: Bestimme rule_id basierend auf Zonen-Status VOR Header
+                        if zone_before_header:
+                            rule_id = "candidate:explicit:callout"
+                            provenance = "explicit:callout"
+                        else:
+                            rule_id = "explicit:callout"  # KEIN candidate: für Standard-Zonen
+                            provenance = "explicit:callout"
+                        
+                        payload = {
+                            "edge_id": _mk_edge_id(k, note_id, t, "note", target_section=sec),
+                            "provenance": provenance,
+                            "rule_id": rule_id,
+                            "confidence": 0.7
+                        }
+                        if sec:
+                            payload["target_section"] = sec
+                        
+                        logger.debug(f"DEBUG-TRACER [Zone-Check]: Callout in {'LLM-Validierungs' if zone_before_header else 'Standard'}-Zone (Zeile {callout_block_lines[0]}) -> rule_id: {rule_id}")
+                        
+                        edges.append(_edge(
+                            kind=k,
+                            scope="note",
+                            source_id=note_id,
+                            target_id=t,
+                            note_id=note_id,
+                            extra=payload
+                        ))
+                
+                # Reset für nächsten Block
+                in_callout_block = False
+                current_kind = None
+                callout_block_lines = []
            continue
        
-        # WP-24c v4.2.2: Prüfe, ob dieser Callout bereits in einem Chunk vorkommt
-        # Härtung: Berücksichtigt auch Sektions-Anker (sec) für Multigraph-Präzision
-        # Ein Callout zu "Note#Section1" ist anders als "Note#Section2" oder "Note"
-        callout_key = (k, t, sec)
-        
-        # WP-24c v4.4.0-DEBUG: Schnittstelle 3 - Global Scan Vergleich
-        is_blocked = callout_key in existing_chunk_callouts
-        logger.debug(f"DEBUG-TRACER [Global Scan Compare]: Key: ({k}, {t}, {sec}), Raw_Target: {raw_t}, In_Block_List: {is_blocked}, Block_List_Size: {len(existing_chunk_callouts) if existing_chunk_callouts else 0}")
-        
-        if is_blocked:
-            # Callout ist bereits in Chunk erfasst -> überspringe (wird mit chunk-Scope angelegt)
-            # Die Sektion (sec) ist bereits im Key enthalten, daher wird Multigraph-Präzision gewährleistet
-            logger.debug(f"DEBUG-TRACER [Global Scan Compare]: Key ({k}, {t}, {sec}) ist blockiert - überspringe")
+        # WP-24c v4.5.6: Prüfe auf Callout-Start
+        callout_start_match = re.match(r'^\s*>{1,}\s*\[!edge\]\s*(.*)$', stripped, re.IGNORECASE)
+        if callout_start_match:
+            in_callout_block = True
+            callout_block_lines = [i]  # Start-Zeile
+            header_content = callout_start_match.group(1).strip()
+            # Prüfe, ob Header einen Typ enthält
+            if header_content and re.match(r'^[a-z_]+$', header_content, re.IGNORECASE):
+                current_kind = header_content.lower()
            continue
        
-        # WP-24c v4.2.1: Callout ist NICHT in Chunks -> lege mit scope: "note" an
-        # (typischerweise in Edge-Zonen, die nicht gechunkt werden)
-        # WP-24c v4.3.1: Confidence auf 0.7 gesenkt, damit chunk-Scope (1.0) gewinnt
-        payload = {
-            "edge_id": _mk_edge_id(k, note_id, t, "note", target_section=sec),
-            "provenance": "explicit:callout",
-            "rule_id": "callout:edge",
-            "confidence": 0.7  # WP-24c v4.3.1: Niedrigere Confidence für Note-Scope Callouts
-        }
-        if sec:
-            payload["target_section"] = sec
+        # WP-24c v4.5.6: Sammle Callout-Block-Zeilen
+        if in_callout_block:
+            if stripped.startswith('>'):
+                callout_block_lines.append(i)
+            else:
+                # Callout-Block beendet - verarbeite gesammelte Zeilen
+                if callout_block_lines:
+                    # Extrahiere Callouts aus diesem Block
+                    block_text = '\n'.join([lines[j] for j in callout_block_lines])
+                    block_call_pairs, _ = extract_callout_relations(block_text)
+                    
+                    # Verarbeite jeden Callout mit Zonen-Kontext
+                    for k, raw_t in block_call_pairs:
+                        t, sec = parse_link_target(raw_t, note_id)
+                        if not t:
+                            continue
+                        
+                        callout_key = (k, t, sec)
+                        is_blocked = callout_key in existing_chunk_callouts
+                        
+                        if is_blocked:
+                            continue
+                        
+                        # WP-24c v4.5.6: Bestimme rule_id basierend auf Zonen-Status
+                        if current_zone_is_llm_validation:
+                            rule_id = "candidate:explicit:callout"
+                            provenance = "explicit:callout"
+                        else:
+                            rule_id = "explicit:callout"  # KEIN candidate: für Standard-Zonen
+                            provenance = "explicit:callout"
+                        
+                        payload = {
+                            "edge_id": _mk_edge_id(k, note_id, t, "note", target_section=sec),
+                            "provenance": provenance,
+                            "rule_id": rule_id,
+                            "confidence": 0.7
+                        }
+                        if sec:
+                            payload["target_section"] = sec
+                        
+                        logger.debug(f"DEBUG-TRACER [Zone-Check]: Callout in {'LLM-Validierungs' if current_zone_is_llm_validation else 'Standard'}-Zone (Zeile {callout_block_lines[0]}) -> rule_id: {rule_id}")
+                        
+                        edges.append(_edge(
+                            kind=k,
+                            scope="note",
+                            source_id=note_id,
+                            target_id=t,
+                            note_id=note_id,
+                            extra=payload
+                        ))
+                
+                # Reset für nächsten Block
+                in_callout_block = False
+                current_kind = None
+                callout_block_lines = []
+    
+    # WP-24c v4.5.6: Verarbeite letzten Callout-Block (falls am Ende)
+    if in_callout_block and callout_block_lines:
+        block_text = '\n'.join([lines[j] for j in callout_block_lines])
+        block_call_pairs, _ = extract_callout_relations(block_text)
        
-        # WP-24c v4.4.0-DEBUG: Schnittstelle 3 - Global Scan erstellt Note-Scope Callout
-        logger.debug(f"DEBUG-TRACER [Global Scan Create]: Erstelle Note-Scope Callout - Kind: {k}, Target: {t}, Section: {sec}, Raw_Target: {raw_t}, Edge_ID: {payload['edge_id']}, Confidence: {payload['confidence']}")
-        
-        edges.append(_edge(
-            kind=k,
-            scope="note",
-            source_id=note_id,
-            target_id=t,
-            note_id=note_id,
-            extra=payload
-        ))
+        for k, raw_t in block_call_pairs:
+            t, sec = parse_link_target(raw_t, note_id)
+            if not t:
+                continue
+            
+            callout_key = (k, t, sec)
+            is_blocked = callout_key in existing_chunk_callouts
+            
+            if is_blocked:
+                continue
+            
+            # WP-24c v4.5.6: Bestimme rule_id basierend auf Zonen-Status
+            if current_zone_is_llm_validation:
+                rule_id = "candidate:explicit:callout"
+                provenance = "explicit:callout"
+            else:
+                rule_id = "explicit:callout"
+                provenance = "explicit:callout"
+            
+            payload = {
+                "edge_id": _mk_edge_id(k, note_id, t, "note", target_section=sec),
+                "provenance": provenance,
+                "rule_id": rule_id,
+                "confidence": 0.7
+            }
+            if sec:
+                payload["target_section"] = sec
+            
+            logger.debug(f"DEBUG-TRACER [Zone-Check]: Callout in {'LLM-Validierungs' if current_zone_is_llm_validation else 'Standard'}-Zone (Zeile {callout_block_lines[0]}) -> rule_id: {rule_id}")
+            
+            edges.append(_edge(
+                kind=k,
+                scope="note",
+                source_id=note_id,
+                target_id=t,
+                note_id=note_id,
+                extra=payload
+            ))
    
    return edges