Enhance LLM validation zone extraction in graph_derive_edges.py

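Support a configurable header level in LLM validation zone detection: the level is read from MINDNET_LLM_VALIDATION_HEADER_LEVEL (default 3; set it to 2 to use H2 headers), which makes header recognition more flexible. Update the callout extraction logic to scan the markdown line by line and track the current zone during callout processing, so that callouts inside an LLM validation zone receive rule_id "candidate:explicit:callout" while callouts in standard zones receive rule_id "explicit:callout". This improves the handling of callouts and their associated metadata and contributes to more precise edge construction.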
2026-01-11 20:58:33 +01:00 · 2026-01-11 20:58:33 +01:00 · ea0fd951f2
commit ea0fd951f2
parent c8c828c8a8
1 changed files with 202 additions and 41 deletions
--- a/app/core/graph/graph_derive_edges.py
+++ b/app/core/graph/graph_derive_edges.py
@ -208,8 +208,9 @@ def extract_llm_validation_zones(markdown_body: str) -> List[Tuple[str, str]]:
    llm_validation_headers = get_llm_validation_zone_headers()
    for i, line in enumerate(lines):
-        # Prüfe auf Header
+        # Prüfe auf Header (konfiguriertes Level aus MINDNET_LLM_VALIDATION_HEADER_LEVEL)
        header_match = re.match(header_pattern, line.strip())
        if header_match:
            header_text = header_match.group(1).strip()
@ -266,11 +267,16 @@ def extract_callouts_from_markdown(
 ) -> List[dict]:
    """
    WP-24c v4.2.1: Extrahiert Callouts aus dem Original-Markdown.
    WP-24c v4.5.6: Header-Status-Maschine für korrekte Zonen-Erkennung.
    Smart Logic: Nur Callouts, die NICHT in Chunks vorkommen (z.B. in Edge-Zonen),
    werden mit scope: "note" angelegt. Callouts, die bereits in Chunks erfasst wurden,
    werden übersprungen, um Duplikate zu vermeiden.
    WP-24c v4.5.6: Prüft für jeden Callout, ob er in einer LLM-Validierungs-Zone liegt.
    - In LLM-Validierungs-Zone: rule_id = "candidate:explicit:callout"
    - In Standard-Zone: rule_id = "explicit:callout" (ohne candidate:)
    Args:
        markdown_body: Original-Markdown-Text (vor Chunking-Filterung)
        note_id: ID der Note
@ -287,52 +293,207 @@ def extract_callouts_from_markdown(
    edges: List[dict] = []
-    # Extrahiere alle Callouts aus dem gesamten Markdown
+    # WP-24c v4.5.6: Header-Status-Maschine - Baue Mapping von Zeilen zu Zonen-Status
-    call_pairs, _ = extract_callout_relations(markdown_body)
+    import os
    import re
-    for k, raw_t in call_pairs:
+    llm_validation_headers = get_llm_validation_zone_headers()
-        t, sec = parse_link_target(raw_t, note_id)
+    llm_validation_level = int(os.getenv("MINDNET_LLM_VALIDATION_HEADER_LEVEL", "3"))
-        if not t:
+    # WP-24c v4.5.6: Konfigurierbare Header-Ebene (vollständig über .env steuerbar)
    header_level_pattern = "#" * llm_validation_level
    header_pattern = rf'^{re.escape(header_level_pattern)}\s+(.+?)$'
    lines = markdown_body.split('\n')
    current_zone_is_llm_validation = False
    # WP-24c v4.5.6: Zeile-für-Zeile Verarbeitung mit Zonen-Tracking
    # Extrahiere Callouts direkt während des Durchlaufs, um Zonen-Kontext zu behalten
    current_kind = None
    in_callout_block = False
    callout_block_lines = []  # Sammle Zeilen eines Callout-Blocks
    for i, line in enumerate(lines):
        stripped = line.strip()
        # WP-24c v4.5.6: Prüfe auf Header (Zonen-Wechsel)
        # Verwendet das konfigurierte Level aus MINDNET_LLM_VALIDATION_HEADER_LEVEL
        header_match = re.match(header_pattern, stripped)
        if header_match:
            header_text = header_match.group(1).strip()
            # Prüfe, ob dieser Header eine LLM-Validierungs-Zone startet
            # WP-24c v4.5.6: Header-Status-Maschine - korrekte Zonen-Erkennung
            current_zone_is_llm_validation = any(
                header_text.lower() == llm_header.lower()
                for llm_header in llm_validation_headers
            )
            logger.debug(f"DEBUG-TRACER [Zone-Change]: Header '{header_text}' (Level {llm_validation_level}) -> LLM-Validierung: {current_zone_is_llm_validation}")
            # Beende aktuellen Callout-Block bei Header-Wechsel
            if in_callout_block:
                # Verarbeite gesammelten Callout-Block VOR dem Zonen-Wechsel
                if callout_block_lines:
                    block_text = '\n'.join([lines[j] for j in callout_block_lines])
                    block_call_pairs, _ = extract_callout_relations(block_text)
                    # Verarbeite jeden Callout mit Zonen-Kontext
                    # WICHTIG: Verwende den Zonen-Status VOR dem Header-Wechsel
                    zone_before_header = current_zone_is_llm_validation
                    for k, raw_t in block_call_pairs:
                        t, sec = parse_link_target(raw_t, note_id)
                        if not t:
                            continue
                        callout_key = (k, t, sec)
                        is_blocked = callout_key in existing_chunk_callouts
                        if is_blocked:
                            continue
                        # WP-24c v4.5.6: Bestimme rule_id basierend auf Zonen-Status VOR Header
                        if zone_before_header:
                            rule_id = "candidate:explicit:callout"
                            provenance = "explicit:callout"
                        else:
                            rule_id = "explicit:callout"  # KEIN candidate: für Standard-Zonen
                            provenance = "explicit:callout"
                        payload = {
                            "edge_id": _mk_edge_id(k, note_id, t, "note", target_section=sec),
                            "provenance": provenance,
                            "rule_id": rule_id,
                            "confidence": 0.7
                        }
                        if sec:
                            payload["target_section"] = sec
                        logger.debug(f"DEBUG-TRACER [Zone-Check]: Callout in {'LLM-Validierungs' if zone_before_header else 'Standard'}-Zone (Zeile {callout_block_lines[0]}) -> rule_id: {rule_id}")
                        edges.append(_edge(
                            kind=k,
                            scope="note",
                            source_id=note_id,
                            target_id=t,
                            note_id=note_id,
                            extra=payload
                        ))
                # Reset für nächsten Block
                in_callout_block = False
                current_kind = None
                callout_block_lines = []
            continue
-        # WP-24c v4.2.2: Prüfe, ob dieser Callout bereits in einem Chunk vorkommt
+        # WP-24c v4.5.6: Prüfe auf Callout-Start
-        # Härtung: Berücksichtigt auch Sektions-Anker (sec) für Multigraph-Präzision
+        callout_start_match = re.match(r'^\s*>{1,}\s*\[!edge\]\s*(.*)$', stripped, re.IGNORECASE)
-        # Ein Callout zu "Note#Section1" ist anders als "Note#Section2" oder "Note"
+        if callout_start_match:
-        callout_key = (k, t, sec)
+            in_callout_block = True
-        
+            callout_block_lines = [i]  # Start-Zeile
-        # WP-24c v4.4.0-DEBUG: Schnittstelle 3 - Global Scan Vergleich
+            header_content = callout_start_match.group(1).strip()
-        is_blocked = callout_key in existing_chunk_callouts
+            # Prüfe, ob Header einen Typ enthält
-        logger.debug(f"DEBUG-TRACER [Global Scan Compare]: Key: ({k}, {t}, {sec}), Raw_Target: {raw_t}, In_Block_List: {is_blocked}, Block_List_Size: {len(existing_chunk_callouts) if existing_chunk_callouts else 0}")
+            if header_content and re.match(r'^[a-z_]+$', header_content, re.IGNORECASE):
-        
+                current_kind = header_content.lower()
        if is_blocked:
            # Callout ist bereits in Chunk erfasst -> überspringe (wird mit chunk-Scope angelegt)
            # Die Sektion (sec) ist bereits im Key enthalten, daher wird Multigraph-Präzision gewährleistet
            logger.debug(f"DEBUG-TRACER [Global Scan Compare]: Key ({k}, {t}, {sec}) ist blockiert - überspringe")
            continue
-        # WP-24c v4.2.1: Callout ist NICHT in Chunks -> lege mit scope: "note" an
+        # WP-24c v4.5.6: Sammle Callout-Block-Zeilen
-        # (typischerweise in Edge-Zonen, die nicht gechunkt werden)
+        if in_callout_block:
-        # WP-24c v4.3.1: Confidence auf 0.7 gesenkt, damit chunk-Scope (1.0) gewinnt
+            if stripped.startswith('>'):
-        payload = {
+                callout_block_lines.append(i)
-            "edge_id": _mk_edge_id(k, note_id, t, "note", target_section=sec),
+            else:
-            "provenance": "explicit:callout",
+                # Callout-Block beendet - verarbeite gesammelte Zeilen
-            "rule_id": "callout:edge",
+                if callout_block_lines:
-            "confidence": 0.7  # WP-24c v4.3.1: Niedrigere Confidence für Note-Scope Callouts
+                    # Extrahiere Callouts aus diesem Block
-        }
+                    block_text = '\n'.join([lines[j] for j in callout_block_lines])
-        if sec:
+                    block_call_pairs, _ = extract_callout_relations(block_text)
            payload["target_section"] = sec
-        # WP-24c v4.4.0-DEBUG: Schnittstelle 3 - Global Scan erstellt Note-Scope Callout
+                    # Verarbeite jeden Callout mit Zonen-Kontext
-        logger.debug(f"DEBUG-TRACER [Global Scan Create]: Erstelle Note-Scope Callout - Kind: {k}, Target: {t}, Section: {sec}, Raw_Target: {raw_t}, Edge_ID: {payload['edge_id']}, Confidence: {payload['confidence']}")
+                    for k, raw_t in block_call_pairs:
                        t, sec = parse_link_target(raw_t, note_id)
                        if not t:
                            continue
-        edges.append(_edge(
+                        callout_key = (k, t, sec)
-            kind=k,
+                        is_blocked = callout_key in existing_chunk_callouts
-            scope="note",
+                        
-            source_id=note_id,
+                        if is_blocked:
-            target_id=t,
+                            continue
-            note_id=note_id,
+                        
-            extra=payload
+                        # WP-24c v4.5.6: Bestimme rule_id basierend auf Zonen-Status
-        ))
+                        if current_zone_is_llm_validation:
                            rule_id = "candidate:explicit:callout"
                            provenance = "explicit:callout"
                        else:
                            rule_id = "explicit:callout"  # KEIN candidate: für Standard-Zonen
                            provenance = "explicit:callout"
                        payload = {
                            "edge_id": _mk_edge_id(k, note_id, t, "note", target_section=sec),
                            "provenance": provenance,
                            "rule_id": rule_id,
                            "confidence": 0.7
                        }
                        if sec:
                            payload["target_section"] = sec
                        logger.debug(f"DEBUG-TRACER [Zone-Check]: Callout in {'LLM-Validierungs' if current_zone_is_llm_validation else 'Standard'}-Zone (Zeile {callout_block_lines[0]}) -> rule_id: {rule_id}")
                        edges.append(_edge(
                            kind=k,
                            scope="note",
                            source_id=note_id,
                            target_id=t,
                            note_id=note_id,
                            extra=payload
                        ))
                # Reset für nächsten Block
                in_callout_block = False
                current_kind = None
                callout_block_lines = []
    # WP-24c v4.5.6: Verarbeite letzten Callout-Block (falls am Ende)
    if in_callout_block and callout_block_lines:
        block_text = '\n'.join([lines[j] for j in callout_block_lines])
        block_call_pairs, _ = extract_callout_relations(block_text)
        for k, raw_t in block_call_pairs:
            t, sec = parse_link_target(raw_t, note_id)
            if not t:
                continue
            callout_key = (k, t, sec)
            is_blocked = callout_key in existing_chunk_callouts
            if is_blocked:
                continue
            # WP-24c v4.5.6: Bestimme rule_id basierend auf Zonen-Status
            if current_zone_is_llm_validation:
                rule_id = "candidate:explicit:callout"
                provenance = "explicit:callout"
            else:
                rule_id = "explicit:callout"
                provenance = "explicit:callout"
            payload = {
                "edge_id": _mk_edge_id(k, note_id, t, "note", target_section=sec),
                "provenance": provenance,
                "rule_id": rule_id,
                "confidence": 0.7
            }
            if sec:
                payload["target_section"] = sec
            logger.debug(f"DEBUG-TRACER [Zone-Check]: Callout in {'LLM-Validierungs' if current_zone_is_llm_validation else 'Standard'}-Zone (Zeile {callout_block_lines[0]}) -> rule_id: {rule_id}")
            edges.append(_edge(
                kind=k,
                scope="note",
                source_id=note_id,
                target_id=t,
                note_id=note_id,
                extra=payload
            ))
    return edges