Add LLM validation zone extraction and configuration support in graph_derive_edges.py

Implement functions to extract LLM validation zones from Markdown, allowing for configurable header identification via environment variables. Enhance the existing note scope zone extraction to differentiate between note scope and LLM validation zones. Update edge building logic to handle LLM validation edges with a 'candidate:' prefix, ensuring proper processing and avoiding duplicates in global scans. This update improves the overall handling of edge data and enhances the flexibility of the extraction process.
2026-01-11 20:19:12 +01:00 · 2026-01-11 20:19:12 +01:00 · c8c828c8a8
commit c8c828c8a8
parent 716a063849
1 changed files with 187 additions and 5 deletions
--- a/app/core/graph/graph_derive_edges.py
+++ b/app/core/graph/graph_derive_edges.py
@ -65,9 +65,32 @@ def get_note_scope_zone_headers() -> List[str]:
        ]
    return header_list
 # WP-24c v4.5.6: Header-basierte Identifikation von LLM-Validierungs-Zonen
 # Konfigurierbar via MINDNET_LLM_VALIDATION_HEADERS (komma-separiert)
 def get_llm_validation_zone_headers() -> List[str]:
     """
     Return the header names that mark LLM validation zones.
     The names are read from the comma-separated environment variable
     MINDNET_LLM_VALIDATION_HEADERS; surrounding whitespace is trimmed and
     empty entries are dropped. If the variable is unset, or nothing usable
     remains after trimming, the built-in default names are returned.
     """
     import os
     default_headers = ["Unzugeordnete Kanten", "Edge Pool", "Candidates"]
     raw_value = os.getenv(
         "MINDNET_LLM_VALIDATION_HEADERS",
         "Unzugeordnete Kanten,Edge Pool,Candidates"
     )
     configured: List[str] = []
     for part in raw_value.split(","):
         name = part.strip()
         if name:
             configured.append(name)
     # An empty result (e.g. the variable is set to "" or " , ") means "use defaults".
     return configured or default_headers
 def extract_note_scope_zones(markdown_body: str) -> List[Tuple[str, str]]:
    """
    WP-24c v4.2.0: Extrahiert Note-Scope Zonen aus Markdown.
    WP-24c v4.5.6: Unterscheidet zwischen Note-Scope-Zonen und LLM-Validierungs-Zonen.
    Identifiziert Sektionen mit spezifischen Headern (konfigurierbar via .env)
    und extrahiert alle darin enthaltenen Links.
@ -93,21 +116,30 @@ def extract_note_scope_zones(markdown_body: str) -> List[Tuple[str, str]]:
    in_zone = False
    zone_content = []
    # WP-24c v4.5.6: Lade beide Header-Listen für Unterscheidung
    zone_headers = get_note_scope_zone_headers()
    llm_validation_headers = get_llm_validation_zone_headers()
    for i, line in enumerate(lines):
        # Prüfe auf Header
        header_match = re.match(header_pattern, line.strip())
        if header_match:
            header_text = header_match.group(1).strip()
-            # Prüfe, ob dieser Header eine Note-Scope Zone ist
+            # WP-24c v4.5.6: Prüfe, ob dieser Header eine Note-Scope Zone ist
-            # WP-24c v4.2.0: Dynamisches Laden der konfigurierten Header
+            # (NICHT eine LLM-Validierungs-Zone - diese werden separat behandelt)
            zone_headers = get_note_scope_zone_headers()
            is_zone_header = any(
                header_text.lower() == zone_header.lower() 
                for zone_header in zone_headers
            )
-            if is_zone_header:
+            # WP-24c v4.5.6: Ignoriere LLM-Validierungs-Zonen hier (werden separat verarbeitet)
            is_llm_validation = any(
                header_text.lower() == llm_header.lower()
                for llm_header in llm_validation_headers
            )
            if is_zone_header and not is_llm_validation:
                in_zone = True
                zone_content = []
                continue
@ -143,6 +175,90 @@ def extract_note_scope_zones(markdown_body: str) -> List[Tuple[str, str]]:
    return edges
 def _collect_llm_zone_edges(zone_lines: List[str]) -> List[Tuple[str, str]]:
     """Extract all (kind, target) pairs from the collected lines of one zone."""
     if not zone_lines:
         return []
     zone_text = '\n'.join(zone_lines)
     collected: List[Tuple[str, str]] = []
     # Typed relations keep the kind reported by the extractor.
     typed, _ = extract_typed_relations(zone_text)
     collected.extend(typed)
     # Plain wikilinks carry no kind of their own and are recorded as "related_to".
     collected.extend(("related_to", wl) for wl in extract_wikilinks(zone_text))
     # WP-24c v4.5.6: callouts inside an LLM validation zone are extracted as well.
     callout_pairs, _ = extract_callout_relations(zone_text)
     collected.extend(callout_pairs)
     return collected
 def extract_llm_validation_zones(markdown_body: str) -> List[Tuple[str, str]]:
     """
     WP-24c v4.5.6: Extract the links found in LLM validation zones of a Markdown body.
     A zone starts at a header of the configured level whose text equals
     (case-insensitively) one of the names returned by
     get_llm_validation_zone_headers(), and ends at the next header of that
     same level or at the end of the document. Typed relations, wikilinks
     (reported as "related_to") and callout relations are collected per zone,
     in that order.
     The header level is read from MINDNET_LLM_VALIDATION_HEADER_LEVEL
     (default 3, i.e. "###"). A value that is not an integer falls back to 3
     instead of raising.
     Args:
         markdown_body: Markdown text of the note; may be empty.
     Returns:
         List[Tuple[str, str]]: list of (kind, target) tuples, in document order.
     """
     if not markdown_body:
         return []
     import os
     import re
     try:
         llm_validation_level = int(os.getenv("MINDNET_LLM_VALIDATION_HEADER_LEVEL", "3"))
     except ValueError:
         # A malformed setting must not abort edge derivation for the whole note.
         llm_validation_level = 3
     if llm_validation_level < 1:
         # Without at least one "#" the header pattern can never match a
         # stripped line, so no zone can be found.
         return []
     header_re = re.compile(rf'^{re.escape("#" * llm_validation_level)}\s+(.+?)$')
     # Lower-case the configured names once instead of once per header line.
     zone_names = {name.lower() for name in get_llm_validation_zone_headers()}
     edges: List[Tuple[str, str]] = []
     in_zone = False
     zone_content: List[str] = []
     # NOTE(review): only headers of exactly the configured level are recognised,
     # so a header of another level (e.g. "## ...") does not close a zone.
     # Confirm this is intended.
     for line in markdown_body.split('\n'):
         header_match = header_re.match(line.strip())
         if header_match:
             # Every header of the configured level closes the running zone.
             # Flushing here (also when the new header opens another zone)
             # keeps the links of directly consecutive zones from being dropped.
             if in_zone:
                 edges.extend(_collect_llm_zone_edges(zone_content))
             zone_content = []
             in_zone = header_match.group(1).strip().lower() in zone_names
             continue
         if in_zone:
             zone_content.append(line)
     # A zone that runs to the end of the document is still open here.
     if in_zone:
         edges.extend(_collect_llm_zone_edges(zone_content))
     return edges
 def extract_callouts_from_markdown(
    markdown_body: str, 
    note_id: str,
@ -249,7 +365,9 @@ def build_edges_for_note(
    note_type = _get(chunks[0], "type") if chunks else "concept"
    # WP-24c v4.2.0: Note-Scope Zonen Extraktion (VOR Chunk-Verarbeitung)
    # WP-24c v4.5.6: Separate Behandlung von LLM-Validierungs-Zonen
    note_scope_edges: List[dict] = []
    llm_validation_edges: List[dict] = []
    if markdown_body:
        # 1. Note-Scope Zonen (Wikilinks und Typed Relations)
@ -280,6 +398,55 @@ def build_edges_for_note(
                extra=payload
            ))
        # WP-24c v4.5.6: LLM-Validierungs-Zonen (mit candidate: Präfix)
        llm_validation_links = extract_llm_validation_zones(markdown_body)
        for kind, raw_target in llm_validation_links:
            target, sec = parse_link_target(raw_target, note_id)
            if not target:
                continue
            # WP-24c v4.5.6: LLM-Validierungs-Kanten mit scope: "note" und rule_id: "candidate:..."
            # Diese werden gegen alle Chunks der Note geprüft
            # Bestimme Provenance basierend auf Link-Typ
            if kind == "related_to":
                # Wikilink in LLM-Validierungs-Zone
                provenance = "explicit:wikilink"
            else:
                # Typed Relation oder Callout in LLM-Validierungs-Zone
                provenance = "explicit"
            payload = {
                "edge_id": _mk_edge_id(kind, note_id, target, "note", target_section=sec),
                "provenance": provenance,
                "rule_id": f"candidate:{provenance}",  # WP-24c v4.5.6: Zonen-Priorität - candidate: Präfix
                "confidence": PROVENANCE_PRIORITY.get(provenance, 0.90)
            }
            if sec:
                payload["target_section"] = sec
            llm_validation_edges.append(_edge(
                kind=kind,
                scope="note",
                source_id=note_id,  # WP-24c v4.5.6: source_id = note_id (Note-Scope für LLM-Validierung)
                target_id=target,
                note_id=note_id,
                extra=payload
            ))
            # WP-24c v4.5.6: Füge Callouts aus LLM-Validierungs-Zonen zu all_chunk_callout_keys hinzu
            # damit sie nicht im globalen Scan doppelt verarbeitet werden
            # (Nur für Callouts, nicht für Wikilinks oder Typed Relations)
            # Callouts werden in extract_llm_validation_zones bereits extrahiert
            # und müssen daher aus dem globalen Scan ausgeschlossen werden
            # Hinweis: extract_llm_validation_zones gibt auch Callouts zurück (als (kind, target) Tupel)
            # Daher müssen wir prüfen, ob es sich um einen Callout handelt
            # (Callouts haben typischerweise spezifische kinds wie "depends_on", "related_to", etc.)
            # Für jetzt nehmen wir an, dass alle Links aus LLM-Validierungs-Zonen als "bereits verarbeitet" markiert werden
            # Dies verhindert Duplikate im globalen Scan
            callout_key = (kind, target, sec)
            all_chunk_callout_keys.add(callout_key)
            logger.debug(f"Note [{note_id}]: LLM-Validierungs-Zone Callout-Key hinzugefügt: ({kind}, {target}, {sec})")
    # 1) Struktur-Kanten (Internal: belongs_to, next/prev)
    # Diese erhalten die Provenienz 'structure' und sind in der Registry geschützt.
    for idx, ch in enumerate(chunks):
@ -400,11 +567,23 @@ def build_edges_for_note(
            if t:
                # WP-24c v4.1.0: target_section fließt nun fest in die ID-Generierung ein
                # WP-24c v4.3.1: explicit:callout erhält Confidence 1.0 für Präzisions-Priorität
                # WP-24c v4.5.6: candidate: Präfix NUR für global_pool (aus LLM-Validierungs-Zonen)
                # Normale Callouts im Fließtext erhalten KEIN candidate: Präfix
                confidence = 1.0 if p == "explicit:callout" else PROVENANCE_PRIORITY.get(p, 0.90)
                # WP-24c v4.5.6: rule_id nur mit candidate: für global_pool (LLM-Validierungs-Zonen)
                # explicit:callout (normale Callouts im Fließtext) erhalten KEIN candidate: Präfix
                if p == "global_pool":
                    rule_id = f"candidate:{p}"
                elif p == "explicit:callout":
                    rule_id = "explicit:callout"  # WP-24c v4.5.6: Kein candidate: für Fließtext-Callouts
                else:
                    rule_id = p  # Andere Provenances ohne candidate:
                payload = {
                    "chunk_id": cid, 
                    "edge_id": _mk_edge_id(k, cid, t, "chunk", target_section=sec),
-                    "provenance": p, "rule_id": f"candidate:{p}", "confidence": confidence
+                    "provenance": p, "rule_id": rule_id, "confidence": confidence
                }
                if sec: payload["target_section"] = sec
                edges.append(_edge(k, "chunk", cid, t, note_id, payload))
@ -483,7 +662,10 @@ def build_edges_for_note(
            }))
    # 4) WP-24c v4.2.0: Note-Scope Edges hinzufügen (VOR De-Duplizierung)
    # WP-24c v4.2.0: Note-Scope Edges hinzufügen
    edges.extend(note_scope_edges)
    # WP-24c v4.5.6: LLM-Validierungs-Edges hinzufügen (mit candidate: Präfix)
    edges.extend(llm_validation_edges)
    # 5) WP-24c v4.2.9 Fix B PHASE 2 (Deduplizierung): Callout-Extraktion aus Markdown
    # Der globale Scan des markdown_body nutzt all_chunk_callout_keys als Ausschlusskriterium.