2026-01-12 10:53:20 +01:00
1 changed files with 202 additions and 41 deletions
--- a/app/core/graph/graph_derive_edges.py
+++ b/app/core/graph/graph_derive_edges.py
@ -208,8 +208,9 @@ def extract_llm_validation_zones(markdown_body: str) -> List[Tuple[str, str]]:
    llm_validation_headers = get_llm_validation_zone_headers()
    for i, line in enumerate(lines):
-        # Prüfe auf Header
+        # Prüfe auf Header (konfiguriertes Level aus MINDNET_LLM_VALIDATION_HEADER_LEVEL)
        header_match = re.match(header_pattern, line.strip())
        if header_match:
            header_text = header_match.group(1).strip()
@ -266,11 +267,16 @@ def extract_callouts_from_markdown(
 ) -> List[dict]:
    """
    WP-24c v4.2.1: Extrahiert Callouts aus dem Original-Markdown.
    WP-24c v4.5.6: Header-Status-Maschine für korrekte Zonen-Erkennung.
    Smart Logic: Nur Callouts, die NICHT in Chunks vorkommen (z.B. in Edge-Zonen),
    werden mit scope: "note" angelegt. Callouts, die bereits in Chunks erfasst wurden,
    werden übersprungen, um Duplikate zu vermeiden.
    WP-24c v4.5.6: Prüft für jeden Callout, ob er in einer LLM-Validierungs-Zone liegt.
    - In LLM-Validierungs-Zone: rule_id = "candidate:explicit:callout"
    - In Standard-Zone: rule_id = "explicit:callout" (ohne candidate:)
    Args:
        markdown_body: Original-Markdown-Text (vor Chunking-Filterung)
        note_id: ID der Note
@ -287,43 +293,198 @@ def extract_callouts_from_markdown(
    edges: List[dict] = []
-    # Extrahiere alle Callouts aus dem gesamten Markdown
+    # WP-24c v4.5.6: Header-Status-Maschine - Baue Mapping von Zeilen zu Zonen-Status
-    call_pairs, _ = extract_callout_relations(markdown_body)
+    import os
    import re
-    for k, raw_t in call_pairs:
+    llm_validation_headers = get_llm_validation_zone_headers()
    llm_validation_level = int(os.getenv("MINDNET_LLM_VALIDATION_HEADER_LEVEL", "3"))
    # WP-24c v4.5.6: Konfigurierbare Header-Ebene (vollständig über .env steuerbar)
    header_level_pattern = "#" * llm_validation_level
    header_pattern = rf'^{re.escape(header_level_pattern)}\s+(.+?)$'
    lines = markdown_body.split('\n')
    current_zone_is_llm_validation = False
    # WP-24c v4.5.6: Zeile-für-Zeile Verarbeitung mit Zonen-Tracking
    # Extrahiere Callouts direkt während des Durchlaufs, um Zonen-Kontext zu behalten
    current_kind = None
    in_callout_block = False
    callout_block_lines = []  # Sammle Zeilen eines Callout-Blocks
    for i, line in enumerate(lines):
        stripped = line.strip()
        # WP-24c v4.5.6: Prüfe auf Header (Zonen-Wechsel)
        # Verwendet das konfigurierte Level aus MINDNET_LLM_VALIDATION_HEADER_LEVEL
        header_match = re.match(header_pattern, stripped)
        if header_match:
            header_text = header_match.group(1).strip()
            # Prüfe, ob dieser Header eine LLM-Validierungs-Zone startet
            # WP-24c v4.5.6: Header-Status-Maschine - korrekte Zonen-Erkennung
            current_zone_is_llm_validation = any(
                header_text.lower() == llm_header.lower()
                for llm_header in llm_validation_headers
            )
            logger.debug(f"DEBUG-TRACER [Zone-Change]: Header '{header_text}' (Level {llm_validation_level}) -> LLM-Validierung: {current_zone_is_llm_validation}")
            # Beende aktuellen Callout-Block bei Header-Wechsel
            if in_callout_block:
                # Verarbeite gesammelten Callout-Block VOR dem Zonen-Wechsel
                if callout_block_lines:
                    block_text = '\n'.join([lines[j] for j in callout_block_lines])
                    block_call_pairs, _ = extract_callout_relations(block_text)
                    # Verarbeite jeden Callout mit Zonen-Kontext
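                    # IMPORTANT: the intent is to use the zone status from BEFORE the header change.
                    # NOTE(review): current_zone_is_llm_validation has already been reassigned for
                    # the NEW header further up in this branch, so the value captured below is the
                    # new zone's status, not the previous one. To honour the intent, the previous
                    # status must be captured before that reassignment.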
                    zone_before_header = current_zone_is_llm_validation
                    for k, raw_t in block_call_pairs:
                        t, sec = parse_link_target(raw_t, note_id)
                        if not t:
                            continue
        # WP-24c v4.2.2: Prüfe, ob dieser Callout bereits in einem Chunk vorkommt
        # Härtung: Berücksichtigt auch Sektions-Anker (sec) für Multigraph-Präzision
        # Ein Callout zu "Note#Section1" ist anders als "Note#Section2" oder "Note"
                        callout_key = (k, t, sec)
        # WP-24c v4.4.0-DEBUG: Schnittstelle 3 - Global Scan Vergleich
                        is_blocked = callout_key in existing_chunk_callouts
        logger.debug(f"DEBUG-TRACER [Global Scan Compare]: Key: ({k}, {t}, {sec}), Raw_Target: {raw_t}, In_Block_List: {is_blocked}, Block_List_Size: {len(existing_chunk_callouts) if existing_chunk_callouts else 0}")
                        if is_blocked:
            # Callout ist bereits in Chunk erfasst -> überspringe (wird mit chunk-Scope angelegt)
            # Die Sektion (sec) ist bereits im Key enthalten, daher wird Multigraph-Präzision gewährleistet
            logger.debug(f"DEBUG-TRACER [Global Scan Compare]: Key ({k}, {t}, {sec}) ist blockiert - überspringe")
                            continue
-        # WP-24c v4.2.1: Callout ist NICHT in Chunks -> lege mit scope: "note" an
+                        # WP-24c v4.5.6: Bestimme rule_id basierend auf Zonen-Status VOR Header
-        # (typischerweise in Edge-Zonen, die nicht gechunkt werden)
+                        if zone_before_header:
-        # WP-24c v4.3.1: Confidence auf 0.7 gesenkt, damit chunk-Scope (1.0) gewinnt
+                            rule_id = "candidate:explicit:callout"
                            provenance = "explicit:callout"
                        else:
                            rule_id = "explicit:callout"  # KEIN candidate: für Standard-Zonen
                            provenance = "explicit:callout"
                        payload = {
                            "edge_id": _mk_edge_id(k, note_id, t, "note", target_section=sec),
-            "provenance": "explicit:callout",
+                            "provenance": provenance,
-            "rule_id": "callout:edge",
+                            "rule_id": rule_id,
-            "confidence": 0.7  # WP-24c v4.3.1: Niedrigere Confidence für Note-Scope Callouts
+                            "confidence": 0.7
                        }
                        if sec:
                            payload["target_section"] = sec
-        # WP-24c v4.4.0-DEBUG: Schnittstelle 3 - Global Scan erstellt Note-Scope Callout
+                        logger.debug(f"DEBUG-TRACER [Zone-Check]: Callout in {'LLM-Validierungs' if zone_before_header else 'Standard'}-Zone (Zeile {callout_block_lines[0]}) -> rule_id: {rule_id}")
-        logger.debug(f"DEBUG-TRACER [Global Scan Create]: Erstelle Note-Scope Callout - Kind: {k}, Target: {t}, Section: {sec}, Raw_Target: {raw_t}, Edge_ID: {payload['edge_id']}, Confidence: {payload['confidence']}")
+                        
                        edges.append(_edge(
                            kind=k,
                            scope="note",
                            source_id=note_id,
                            target_id=t,
                            note_id=note_id,
                            extra=payload
                        ))
                # Reset für nächsten Block
                in_callout_block = False
                current_kind = None
                callout_block_lines = []
            continue
        # WP-24c v4.5.6: Prüfe auf Callout-Start
        callout_start_match = re.match(r'^\s*>{1,}\s*\[!edge\]\s*(.*)$', stripped, re.IGNORECASE)
        if callout_start_match:
            in_callout_block = True
            callout_block_lines = [i]  # Start-Zeile
            header_content = callout_start_match.group(1).strip()
            # Prüfe, ob Header einen Typ enthält
            if header_content and re.match(r'^[a-z_]+$', header_content, re.IGNORECASE):
                current_kind = header_content.lower()
            continue
        # WP-24c v4.5.6: Sammle Callout-Block-Zeilen
        if in_callout_block:
            if stripped.startswith('>'):
                callout_block_lines.append(i)
            else:
                # Callout-Block beendet - verarbeite gesammelte Zeilen
                if callout_block_lines:
                    # Extrahiere Callouts aus diesem Block
                    block_text = '\n'.join([lines[j] for j in callout_block_lines])
                    block_call_pairs, _ = extract_callout_relations(block_text)
                    # Verarbeite jeden Callout mit Zonen-Kontext
                    for k, raw_t in block_call_pairs:
                        t, sec = parse_link_target(raw_t, note_id)
                        if not t:
                            continue
                        callout_key = (k, t, sec)
                        is_blocked = callout_key in existing_chunk_callouts
                        if is_blocked:
                            continue
                        # WP-24c v4.5.6: Bestimme rule_id basierend auf Zonen-Status
                        if current_zone_is_llm_validation:
                            rule_id = "candidate:explicit:callout"
                            provenance = "explicit:callout"
                        else:
                            rule_id = "explicit:callout"  # KEIN candidate: für Standard-Zonen
                            provenance = "explicit:callout"
                        payload = {
                            "edge_id": _mk_edge_id(k, note_id, t, "note", target_section=sec),
                            "provenance": provenance,
                            "rule_id": rule_id,
                            "confidence": 0.7
                        }
                        if sec:
                            payload["target_section"] = sec
                        logger.debug(f"DEBUG-TRACER [Zone-Check]: Callout in {'LLM-Validierungs' if current_zone_is_llm_validation else 'Standard'}-Zone (Zeile {callout_block_lines[0]}) -> rule_id: {rule_id}")
                        edges.append(_edge(
                            kind=k,
                            scope="note",
                            source_id=note_id,
                            target_id=t,
                            note_id=note_id,
                            extra=payload
                        ))
                # Reset für nächsten Block
                in_callout_block = False
                current_kind = None
                callout_block_lines = []
    # WP-24c v4.5.6: Verarbeite letzten Callout-Block (falls am Ende)
    if in_callout_block and callout_block_lines:
        block_text = '\n'.join([lines[j] for j in callout_block_lines])
        block_call_pairs, _ = extract_callout_relations(block_text)
        for k, raw_t in block_call_pairs:
            t, sec = parse_link_target(raw_t, note_id)
            if not t:
                continue
            callout_key = (k, t, sec)
            is_blocked = callout_key in existing_chunk_callouts
            if is_blocked:
                continue
            # WP-24c v4.5.6: Bestimme rule_id basierend auf Zonen-Status
            if current_zone_is_llm_validation:
                rule_id = "candidate:explicit:callout"
                provenance = "explicit:callout"
            else:
                rule_id = "explicit:callout"
                provenance = "explicit:callout"
            payload = {
                "edge_id": _mk_edge_id(k, note_id, t, "note", target_section=sec),
                "provenance": provenance,
                "rule_id": rule_id,
                "confidence": 0.7
            }
            if sec:
                payload["target_section"] = sec
            logger.debug(f"DEBUG-TRACER [Zone-Check]: Callout in {'LLM-Validierungs' if current_zone_is_llm_validation else 'Standard'}-Zone (Zeile {callout_block_lines[0]}) -> rule_id: {rule_id}")
            edges.append(_edge(
                kind=k,