Enhance LLM validation zone extraction in graph_derive_edges.py

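Implement support for H2 headers in LLM validation zone detection: the header level that marks a zone is read from MINDNET_LLM_VALIDATION_HEADER_LEVEL rather than being fixed, which makes header recognition more flexible. Update the callout extraction logic to process the Markdown line by line and track the current zone while callouts are collected, so that callouts in LLM validation zones are accurately differentiated from those in standard zones. Callouts in an LLM validation zone receive the rule_id "candidate:explicit:callout"; callouts in a standard zone receive "explicit:callout". This improves the handling of callouts and their associated metadata and contributes to more precise edge construction.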
2026-01-11 20:58:33 +01:00 · 2026-01-11 20:58:33 +01:00 · ea0fd951f2
commit ea0fd951f2
parent c8c828c8a8
1 changed files with 202 additions and 41 deletions
--- a/app/core/graph/graph_derive_edges.py
+++ b/app/core/graph/graph_derive_edges.py
@ -208,8 +208,9 @@ def extract_llm_validation_zones(markdown_body: str) -> List[Tuple[str, str]]:
    llm_validation_headers = get_llm_validation_zone_headers()
    
    for i, line in enumerate(lines):
-        # Prüfe auf Header
+        # Prüfe auf Header (konfiguriertes Level aus MINDNET_LLM_VALIDATION_HEADER_LEVEL)
        header_match = re.match(header_pattern, line.strip())
+        
        if header_match:
            header_text = header_match.group(1).strip()
            
@ -266,11 +267,16 @@ def extract_callouts_from_markdown(
 ) -> List[dict]:
    """
    WP-24c v4.2.1: Extrahiert Callouts aus dem Original-Markdown.
+    WP-24c v4.5.6: Header-Status-Maschine für korrekte Zonen-Erkennung.
    
    Smart Logic: Nur Callouts, die NICHT in Chunks vorkommen (z.B. in Edge-Zonen),
    werden mit scope: "note" angelegt. Callouts, die bereits in Chunks erfasst wurden,
    werden übersprungen, um Duplikate zu vermeiden.
    
+    WP-24c v4.5.6: Prüft für jeden Callout, ob er in einer LLM-Validierungs-Zone liegt.
+    - In LLM-Validierungs-Zone: rule_id = "candidate:explicit:callout"
+    - In Standard-Zone: rule_id = "explicit:callout" (ohne candidate:)
+    
    Args:
        markdown_body: Original-Markdown-Text (vor Chunking-Filterung)
        note_id: ID der Note
@ -287,52 +293,207 @@ def extract_callouts_from_markdown(
    
    edges: List[dict] = []
    
-    # Extrahiere alle Callouts aus dem gesamten Markdown
-    call_pairs, _ = extract_callout_relations(markdown_body)
+    # WP-24c v4.5.6: Header-Status-Maschine - Baue Mapping von Zeilen zu Zonen-Status
+    import os
+    import re
    
-    for k, raw_t in call_pairs:
-        t, sec = parse_link_target(raw_t, note_id)
-        if not t:
+    llm_validation_headers = get_llm_validation_zone_headers()
+    llm_validation_level = int(os.getenv("MINDNET_LLM_VALIDATION_HEADER_LEVEL", "3"))
+    # WP-24c v4.5.6: Konfigurierbare Header-Ebene (vollständig über .env steuerbar)
+    header_level_pattern = "#" * llm_validation_level
+    header_pattern = rf'^{re.escape(header_level_pattern)}\s+(.+?)$'
+    
+    lines = markdown_body.split('\n')
+    current_zone_is_llm_validation = False
+    
+    # WP-24c v4.5.6: Zeile-für-Zeile Verarbeitung mit Zonen-Tracking
+    # Extrahiere Callouts direkt während des Durchlaufs, um Zonen-Kontext zu behalten
+    current_kind = None
+    in_callout_block = False
+    callout_block_lines = []  # Sammle Zeilen eines Callout-Blocks
+    
+    for i, line in enumerate(lines):
+        stripped = line.strip()
+        
+        # WP-24c v4.5.6: Prüfe auf Header (Zonen-Wechsel)
+        # Verwendet das konfigurierte Level aus MINDNET_LLM_VALIDATION_HEADER_LEVEL
+        header_match = re.match(header_pattern, stripped)
+        
+        if header_match:
+            header_text = header_match.group(1).strip()
+            # Prüfe, ob dieser Header eine LLM-Validierungs-Zone startet
+            # WP-24c v4.5.6: Header-Status-Maschine - korrekte Zonen-Erkennung
+            current_zone_is_llm_validation = any(
+                header_text.lower() == llm_header.lower()
+                for llm_header in llm_validation_headers
+            )
+            logger.debug(f"DEBUG-TRACER [Zone-Change]: Header '{header_text}' (Level {llm_validation_level}) -> LLM-Validierung: {current_zone_is_llm_validation}")
+            # Beende aktuellen Callout-Block bei Header-Wechsel
+            if in_callout_block:
+                # Verarbeite gesammelten Callout-Block VOR dem Zonen-Wechsel
+                if callout_block_lines:
+                    block_text = '\n'.join([lines[j] for j in callout_block_lines])
+                    block_call_pairs, _ = extract_callout_relations(block_text)
+                    
+                    # Verarbeite jeden Callout mit Zonen-Kontext
+                    # WICHTIG: Verwende den Zonen-Status VOR dem Header-Wechsel
+                    zone_before_header = current_zone_is_llm_validation
+                    
+                    for k, raw_t in block_call_pairs:
+                        t, sec = parse_link_target(raw_t, note_id)
+                        if not t:
+                            continue
+                        
+                        callout_key = (k, t, sec)
+                        is_blocked = callout_key in existing_chunk_callouts
+                        
+                        if is_blocked:
+                            continue
+                        
+                        # WP-24c v4.5.6: Bestimme rule_id basierend auf Zonen-Status VOR Header
+                        if zone_before_header:
+                            rule_id = "candidate:explicit:callout"
+                            provenance = "explicit:callout"
+                        else:
+                            rule_id = "explicit:callout"  # KEIN candidate: für Standard-Zonen
+                            provenance = "explicit:callout"
+                        
+                        payload = {
+                            "edge_id": _mk_edge_id(k, note_id, t, "note", target_section=sec),
+                            "provenance": provenance,
+                            "rule_id": rule_id,
+                            "confidence": 0.7
+                        }
+                        if sec:
+                            payload["target_section"] = sec
+                        
+                        logger.debug(f"DEBUG-TRACER [Zone-Check]: Callout in {'LLM-Validierungs' if zone_before_header else 'Standard'}-Zone (Zeile {callout_block_lines[0]}) -> rule_id: {rule_id}")
+                        
+                        edges.append(_edge(
+                            kind=k,
+                            scope="note",
+                            source_id=note_id,
+                            target_id=t,
+                            note_id=note_id,
+                            extra=payload
+                        ))
+                
+                # Reset für nächsten Block
+                in_callout_block = False
+                current_kind = None
+                callout_block_lines = []
            continue
        
-        # WP-24c v4.2.2: Prüfe, ob dieser Callout bereits in einem Chunk vorkommt
-        # Härtung: Berücksichtigt auch Sektions-Anker (sec) für Multigraph-Präzision
-        # Ein Callout zu "Note#Section1" ist anders als "Note#Section2" oder "Note"
-        callout_key = (k, t, sec)
-        
-        # WP-24c v4.4.0-DEBUG: Schnittstelle 3 - Global Scan Vergleich
-        is_blocked = callout_key in existing_chunk_callouts
-        logger.debug(f"DEBUG-TRACER [Global Scan Compare]: Key: ({k}, {t}, {sec}), Raw_Target: {raw_t}, In_Block_List: {is_blocked}, Block_List_Size: {len(existing_chunk_callouts) if existing_chunk_callouts else 0}")
-        
-        if is_blocked:
-            # Callout ist bereits in Chunk erfasst -> überspringe (wird mit chunk-Scope angelegt)
-            # Die Sektion (sec) ist bereits im Key enthalten, daher wird Multigraph-Präzision gewährleistet
-            logger.debug(f"DEBUG-TRACER [Global Scan Compare]: Key ({k}, {t}, {sec}) ist blockiert - überspringe")
+        # WP-24c v4.5.6: Prüfe auf Callout-Start
+        callout_start_match = re.match(r'^\s*>{1,}\s*\[!edge\]\s*(.*)$', stripped, re.IGNORECASE)
+        if callout_start_match:
+            in_callout_block = True
+            callout_block_lines = [i]  # Start-Zeile
+            header_content = callout_start_match.group(1).strip()
+            # Prüfe, ob Header einen Typ enthält
+            if header_content and re.match(r'^[a-z_]+$', header_content, re.IGNORECASE):
+                current_kind = header_content.lower()
            continue
        
-        # WP-24c v4.2.1: Callout ist NICHT in Chunks -> lege mit scope: "note" an
-        # (typischerweise in Edge-Zonen, die nicht gechunkt werden)
-        # WP-24c v4.3.1: Confidence auf 0.7 gesenkt, damit chunk-Scope (1.0) gewinnt
-        payload = {
-            "edge_id": _mk_edge_id(k, note_id, t, "note", target_section=sec),
-            "provenance": "explicit:callout",
-            "rule_id": "callout:edge",
-            "confidence": 0.7  # WP-24c v4.3.1: Niedrigere Confidence für Note-Scope Callouts
-        }
-        if sec:
-            payload["target_section"] = sec
+        # WP-24c v4.5.6: Sammle Callout-Block-Zeilen
+        if in_callout_block:
+            if stripped.startswith('>'):
+                callout_block_lines.append(i)
+            else:
+                # Callout-Block beendet - verarbeite gesammelte Zeilen
+                if callout_block_lines:
+                    # Extrahiere Callouts aus diesem Block
+                    block_text = '\n'.join([lines[j] for j in callout_block_lines])
+                    block_call_pairs, _ = extract_callout_relations(block_text)
+                    
+                    # Verarbeite jeden Callout mit Zonen-Kontext
+                    for k, raw_t in block_call_pairs:
+                        t, sec = parse_link_target(raw_t, note_id)
+                        if not t:
+                            continue
+                        
+                        callout_key = (k, t, sec)
+                        is_blocked = callout_key in existing_chunk_callouts
+                        
+                        if is_blocked:
+                            continue
+                        
+                        # WP-24c v4.5.6: Bestimme rule_id basierend auf Zonen-Status
+                        if current_zone_is_llm_validation:
+                            rule_id = "candidate:explicit:callout"
+                            provenance = "explicit:callout"
+                        else:
+                            rule_id = "explicit:callout"  # KEIN candidate: für Standard-Zonen
+                            provenance = "explicit:callout"
+                        
+                        payload = {
+                            "edge_id": _mk_edge_id(k, note_id, t, "note", target_section=sec),
+                            "provenance": provenance,
+                            "rule_id": rule_id,
+                            "confidence": 0.7
+                        }
+                        if sec:
+                            payload["target_section"] = sec
+                        
+                        logger.debug(f"DEBUG-TRACER [Zone-Check]: Callout in {'LLM-Validierungs' if current_zone_is_llm_validation else 'Standard'}-Zone (Zeile {callout_block_lines[0]}) -> rule_id: {rule_id}")
+                        
+                        edges.append(_edge(
+                            kind=k,
+                            scope="note",
+                            source_id=note_id,
+                            target_id=t,
+                            note_id=note_id,
+                            extra=payload
+                        ))
+                
+                # Reset für nächsten Block
+                in_callout_block = False
+                current_kind = None
+                callout_block_lines = []
+    
+    # WP-24c v4.5.6: Verarbeite letzten Callout-Block (falls am Ende)
+    if in_callout_block and callout_block_lines:
+        block_text = '\n'.join([lines[j] for j in callout_block_lines])
+        block_call_pairs, _ = extract_callout_relations(block_text)
        
-        # WP-24c v4.4.0-DEBUG: Schnittstelle 3 - Global Scan erstellt Note-Scope Callout
-        logger.debug(f"DEBUG-TRACER [Global Scan Create]: Erstelle Note-Scope Callout - Kind: {k}, Target: {t}, Section: {sec}, Raw_Target: {raw_t}, Edge_ID: {payload['edge_id']}, Confidence: {payload['confidence']}")
-        
-        edges.append(_edge(
-            kind=k,
-            scope="note",
-            source_id=note_id,
-            target_id=t,
-            note_id=note_id,
-            extra=payload
-        ))
+        for k, raw_t in block_call_pairs:
+            t, sec = parse_link_target(raw_t, note_id)
+            if not t:
+                continue
+            
+            callout_key = (k, t, sec)
+            is_blocked = callout_key in existing_chunk_callouts
+            
+            if is_blocked:
+                continue
+            
+            # WP-24c v4.5.6: Bestimme rule_id basierend auf Zonen-Status
+            if current_zone_is_llm_validation:
+                rule_id = "candidate:explicit:callout"
+                provenance = "explicit:callout"
+            else:
+                rule_id = "explicit:callout"
+                provenance = "explicit:callout"
+            
+            payload = {
+                "edge_id": _mk_edge_id(k, note_id, t, "note", target_section=sec),
+                "provenance": provenance,
+                "rule_id": rule_id,
+                "confidence": 0.7
+            }
+            if sec:
+                payload["target_section"] = sec
+            
+            logger.debug(f"DEBUG-TRACER [Zone-Check]: Callout in {'LLM-Validierungs' if current_zone_is_llm_validation else 'Standard'}-Zone (Zeile {callout_block_lines[0]}) -> rule_id: {rule_id}")
+            
+            edges.append(_edge(
+                kind=k,
+                scope="note",
+                source_id=note_id,
+                target_id=t,
+                note_id=note_id,
+                extra=payload
+            ))
    
    return edges