2026-01-12 10:53:20 +01:00
1 changed files with 187 additions and 5 deletions
--- a/app/core/graph/graph_derive_edges.py
+++ b/app/core/graph/graph_derive_edges.py
@ -65,9 +65,32 @@ def get_note_scope_zone_headers() -> List[str]:
        ]
    return header_list

+# WP-24c v4.5.6: Header-basierte Identifikation von LLM-Validierungs-Zonen
+# Konfigurierbar via MINDNET_LLM_VALIDATION_HEADERS (komma-separiert)
+def get_llm_validation_zone_headers() -> List[str]:
+    """
+    Return the header names that mark LLM validation zones.
+
+    Names are read from the comma-separated environment variable
+    MINDNET_LLM_VALIDATION_HEADERS; blank entries are dropped. When the
+    variable is unset, or contains no usable name, the built-in defaults
+    are returned instead.
+    """
+    import os
+    defaults = ["Unzugeordnete Kanten", "Edge Pool", "Candidates"]
+    raw_value = os.getenv(
+        "MINDNET_LLM_VALIDATION_HEADERS",
+        "Unzugeordnete Kanten,Edge Pool,Candidates"
+    )
+    # Trim each entry and skip the ones that are empty after trimming.
+    stripped = (part.strip() for part in raw_value.split(","))
+    configured = [name for name in stripped if name]
+    return configured if configured else defaults
+
 def extract_note_scope_zones(markdown_body: str) -> List[Tuple[str, str]]:
    """
    WP-24c v4.2.0: Extrahiert Note-Scope Zonen aus Markdown.
+    WP-24c v4.5.6: Unterscheidet zwischen Note-Scope-Zonen und LLM-Validierungs-Zonen.
    
    Identifiziert Sektionen mit spezifischen Headern (konfigurierbar via .env)
    und extrahiert alle darin enthaltenen Links.
@ -93,21 +116,30 @@ def extract_note_scope_zones(markdown_body: str) -> List[Tuple[str, str]]:
    in_zone = False
    zone_content = []
    
+    # WP-24c v4.5.6: Lade beide Header-Listen für Unterscheidung
+    zone_headers = get_note_scope_zone_headers()
+    llm_validation_headers = get_llm_validation_zone_headers()
+    
    for i, line in enumerate(lines):
        # Prüfe auf Header
        header_match = re.match(header_pattern, line.strip())
        if header_match:
            header_text = header_match.group(1).strip()
            
-            # Prüfe, ob dieser Header eine Note-Scope Zone ist
-            # WP-24c v4.2.0: Dynamisches Laden der konfigurierten Header
-            zone_headers = get_note_scope_zone_headers()
+            # WP-24c v4.5.6: Prüfe, ob dieser Header eine Note-Scope Zone ist
+            # (NICHT eine LLM-Validierungs-Zone - diese werden separat behandelt)
            is_zone_header = any(
                header_text.lower() == zone_header.lower() 
                for zone_header in zone_headers
            )
            
-            if is_zone_header:
+            # WP-24c v4.5.6: Ignoriere LLM-Validierungs-Zonen hier (werden separat verarbeitet)
+            is_llm_validation = any(
+                header_text.lower() == llm_header.lower()
+                for llm_header in llm_validation_headers
+            )
+            
+            if is_zone_header and not is_llm_validation:
                in_zone = True
                zone_content = []
                continue
@ -143,6 +175,90 @@ def extract_note_scope_zones(markdown_body: str) -> List[Tuple[str, str]]:
    
    return edges

+def extract_llm_validation_zones(markdown_body: str) -> List[Tuple[str, str]]:
+    """
+    WP-24c v4.5.6: Extract the links found in LLM validation zones.
+
+    A zone starts at a header whose level equals
+    MINDNET_LLM_VALIDATION_HEADER_LEVEL (default 3) and whose text matches,
+    case-insensitively, one of the names returned by
+    get_llm_validation_zone_headers(). It ends at the next header of that
+    same level or at the end of the document; headers of other levels do
+    not match the pattern and are collected as ordinary zone content.
+
+    Each zone is passed to extract_typed_relations, extract_wikilinks and
+    extract_callout_relations. A zone directly followed by another
+    validation header is processed before the next zone starts (the
+    earlier revision silently dropped its collected lines).
+
+    Args:
+        markdown_body: Markdown text of the note; may be empty.
+
+    Returns:
+        List[Tuple[str, str]]: (kind, target) tuples; per zone the typed
+        relations come first, then wikilinks, then callout relations.
+
+    Raises:
+        ValueError: if MINDNET_LLM_VALIDATION_HEADER_LEVEL is not an integer.
+    """
+    if not markdown_body:
+        return []
+
+    import os
+    import re
+
+    edges: List[Tuple[str, str]] = []
+
+    def _flush_zone(zone_lines: List[str]) -> None:
+        """Extract every edge from one finished zone and append it to edges."""
+        if not zone_lines:
+            return
+        zone_text = '\n'.join(zone_lines)
+        # Typed relations first, keeping the established output order.
+        typed, _ = extract_typed_relations(zone_text)
+        edges.extend(typed)
+        # Plain wikilinks carry no relation type and become "related_to".
+        edges.extend(("related_to", wl) for wl in extract_wikilinks(zone_text))
+        # Callout relations inside the zone are collected as well.
+        callout_pairs, _ = extract_callout_relations(zone_text)
+        edges.extend(callout_pairs)
+
+    # Only headers of exactly the configured level are recognised: the
+    # pattern requires whitespace right after the run of '#' characters.
+    llm_validation_level = int(os.getenv("MINDNET_LLM_VALIDATION_HEADER_LEVEL", "3"))
+    header_level_pattern = "#" * llm_validation_level
+    header_pattern = rf'^{re.escape(header_level_pattern)}\s+(.+?)$'
+
+    # Lower-case the names once so each header check is a set lookup.
+    llm_validation_headers = {
+        name.lower() for name in get_llm_validation_zone_headers()
+    }
+
+    in_zone = False
+    zone_content: List[str] = []
+
+    for line in markdown_body.split('\n'):
+        header_match = re.match(header_pattern, line.strip())
+        if header_match:
+            # Any header of the configured level closes the open zone,
+            # including one that opens the next validation zone.
+            if in_zone:
+                _flush_zone(zone_content)
+            zone_content = []
+            header_text = header_match.group(1).strip()
+            in_zone = header_text.lower() in llm_validation_headers
+            # The header line itself is never part of the zone content.
+            continue
+
+        if in_zone:
+            zone_content.append(line)
+
+    # A zone that runs to the end of the document still has to be flushed.
+    if in_zone:
+        _flush_zone(zone_content)
+
+    return edges
+
 def extract_callouts_from_markdown(
    markdown_body: str, 
    note_id: str,
@ -249,7 +365,9 @@ def build_edges_for_note(
    note_type = _get(chunks[0], "type") if chunks else "concept"
    
    # WP-24c v4.2.0: Note-Scope Zonen Extraktion (VOR Chunk-Verarbeitung)
+    # WP-24c v4.5.6: Separate Behandlung von LLM-Validierungs-Zonen
    note_scope_edges: List[dict] = []
+    llm_validation_edges: List[dict] = []
    
    if markdown_body:
        # 1. Note-Scope Zonen (Wikilinks und Typed Relations)
@ -279,6 +397,55 @@ def build_edges_for_note(
                note_id=note_id,
                extra=payload
            ))
+        
+        # WP-24c v4.5.6: LLM-Validierungs-Zonen (mit candidate: Präfix)
+        llm_validation_links = extract_llm_validation_zones(markdown_body)
+        for kind, raw_target in llm_validation_links:
+            target, sec = parse_link_target(raw_target, note_id)
+            if not target:
+                continue
+            
+            # WP-24c v4.5.6: LLM-Validierungs-Kanten mit scope: "note" und rule_id: "candidate:..."
+            # Diese werden gegen alle Chunks der Note geprüft
+            # Bestimme Provenance basierend auf Link-Typ
+            if kind == "related_to":
+                # Wikilink in LLM-Validierungs-Zone
+                provenance = "explicit:wikilink"
+            else:
+                # Typed Relation oder Callout in LLM-Validierungs-Zone
+                provenance = "explicit"
+            
+            payload = {
+                "edge_id": _mk_edge_id(kind, note_id, target, "note", target_section=sec),
+                "provenance": provenance,
+                "rule_id": f"candidate:{provenance}",  # WP-24c v4.5.6: Zonen-Priorität - candidate: Präfix
+                "confidence": PROVENANCE_PRIORITY.get(provenance, 0.90)
+            }
+            if sec:
+                payload["target_section"] = sec
+            
+            llm_validation_edges.append(_edge(
+                kind=kind,
+                scope="note",
+                source_id=note_id,  # WP-24c v4.5.6: source_id = note_id (Note-Scope für LLM-Validierung)
+                target_id=target,
+                note_id=note_id,
+                extra=payload
+            ))
+            
+            # WP-24c v4.5.6: Füge Callouts aus LLM-Validierungs-Zonen zu all_chunk_callout_keys hinzu
+            # damit sie nicht im globalen Scan doppelt verarbeitet werden
+            # (Nur für Callouts, nicht für Wikilinks oder Typed Relations)
+            # Callouts werden in extract_llm_validation_zones bereits extrahiert
+            # und müssen daher aus dem globalen Scan ausgeschlossen werden
+            # Hinweis: extract_llm_validation_zones gibt auch Callouts zurück (als (kind, target) Tupel)
+            # Daher müssen wir prüfen, ob es sich um einen Callout handelt
+            # (Callouts haben typischerweise spezifische kinds wie "depends_on", "related_to", etc.)
+            # Für jetzt nehmen wir an, dass alle Links aus LLM-Validierungs-Zonen als "bereits verarbeitet" markiert werden
+            # Dies verhindert Duplikate im globalen Scan
+            callout_key = (kind, target, sec)
+            all_chunk_callout_keys.add(callout_key)
+            logger.debug(f"Note [{note_id}]: LLM-Validierungs-Zone Callout-Key hinzugefügt: ({kind}, {target}, {sec})")

    # 1) Struktur-Kanten (Internal: belongs_to, next/prev)
    # Diese erhalten die Provenienz 'structure' und sind in der Registry geschützt.
@ -400,11 +567,23 @@ def build_edges_for_note(
            if t:
                # WP-24c v4.1.0: target_section fließt nun fest in die ID-Generierung ein
                # WP-24c v4.3.1: explicit:callout erhält Confidence 1.0 für Präzisions-Priorität
+                # WP-24c v4.5.6: candidate: Präfix NUR für global_pool (aus LLM-Validierungs-Zonen)
+                # Normale Callouts im Fließtext erhalten KEIN candidate: Präfix
                confidence = 1.0 if p == "explicit:callout" else PROVENANCE_PRIORITY.get(p, 0.90)
+                
+                # WP-24c v4.5.6: rule_id nur mit candidate: für global_pool (LLM-Validierungs-Zonen)
+                # explicit:callout (normale Callouts im Fließtext) erhalten KEIN candidate: Präfix
+                if p == "global_pool":
+                    rule_id = f"candidate:{p}"
+                elif p == "explicit:callout":
+                    rule_id = "explicit:callout"  # WP-24c v4.5.6: Kein candidate: für Fließtext-Callouts
+                else:
+                    rule_id = p  # Andere Provenances ohne candidate:
+                
                payload = {
                    "chunk_id": cid, 
                    "edge_id": _mk_edge_id(k, cid, t, "chunk", target_section=sec),
-                    "provenance": p, "rule_id": f"candidate:{p}", "confidence": confidence
+                    "provenance": p, "rule_id": rule_id, "confidence": confidence
                }
                if sec: payload["target_section"] = sec
                edges.append(_edge(k, "chunk", cid, t, note_id, payload))
@ -483,7 +662,10 @@ def build_edges_for_note(
            }))

    # 4) WP-24c v4.2.0: Note-Scope Edges hinzufügen (VOR De-Duplizierung)
+    # WP-24c v4.2.0: Note-Scope Edges hinzufügen
    edges.extend(note_scope_edges)
+    # WP-24c v4.5.6: LLM-Validierungs-Edges hinzufügen (mit candidate: Präfix)
+    edges.extend(llm_validation_edges)
    
    # 5) WP-24c v4.2.9 Fix B PHASE 2 (Deduplizierung): Callout-Extraktion aus Markdown
    # Der globale Scan des markdown_body nutzt all_chunk_callout_keys als Ausschlusskriterium.